http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java new file mode 100644 index 0000000..7341a02 --- /dev/null +++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.morfologik.tagdict; + +import static org.junit.Assert.*; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.file.Path; + +import opennlp.morfologik.builder.POSDictionayBuilderTest; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSSample; +import opennlp.tools.postag.POSTaggerFactory; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.postag.TagDictionary; +import opennlp.tools.postag.WordTagSampleStream; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.ModelType; + +import org.junit.Test; + +/** + * Tests for the {@link POSTaggerFactory} class. + */ +public class POSTaggerFactoryTest { + + private static ObjectStream<POSSample> createSampleStream() + throws IOException { + InputStream in = POSTaggerFactoryTest.class.getClassLoader() + .getResourceAsStream("AnnotatedSentences.txt"); + + return new WordTagSampleStream((new InputStreamReader(in))); + } + + static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory) + throws IOException { + return POSTaggerME.train("en", createSampleStream(), + TrainingParameters.defaultParams(), factory); + } + + @Test + public void testPOSTaggerWithCustomFactory() throws Exception { + + Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary(); + POSTaggerFactory inFactory = new MorfologikPOSTaggerFactory(); + TagDictionary inDict = inFactory.createTagDictionary(dictionary.toFile()); + inFactory.setTagDictionary(inDict); + + POSModel posModel = trainPOSModel(ModelType.MAXENT, inFactory); + + POSTaggerFactory factory = posModel.getFactory(); + assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary); + + factory = null; + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + posModel.serialize(out); + ByteArrayInputStream 
in = new ByteArrayInputStream(out.toByteArray()); + + POSModel fromSerialized = new POSModel(in); + + factory = fromSerialized.getFactory(); + assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary); + + assertEquals(2, factory.getTagDictionary().getTags("casa").length); + } + +} \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt b/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt new file mode 100644 index 0000000..b40be87 --- /dev/null +++ b/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt @@ -0,0 +1,136 @@ +Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._. +I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._. +So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._. +She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._. +I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._. + +Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._. +Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._. +I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._. +As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._. +The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._. 
+But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._. +It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._. +She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstraße_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._. + +Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_. +About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._. +That_DT sounds_VBZ good_JJ ._. +So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstraße_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._. +I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstraße_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._. +Thank_VB you_PRP very_RB much_RB !_. + +Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._. +I_PRP 'm_VBP in_IN Nürnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._. +Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstraße_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Café_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._. +Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_. + +My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._. +We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._. +Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._. +As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._. 
+But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._. +Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstraße_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._. + +I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._. +I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._. +I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._. +I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._. + +Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._. +As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._. +The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._. +Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._. + +An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._. +Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._. +She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._. +But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._. 
+So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT café_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._. +It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._. +She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._. + +Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_. +They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._. +Are_VBP they_PRP made_VBN of_IN leather_NN ?_. +No,_NNP that_DT 's_VBZ faked_VBN ._. +But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._. +I_PRP got_VBD them_PRP from_IN Hamburg._NNP +Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstraße_NNP ?_. +It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._. +I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._. +Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_. +Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstraße_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._. +I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._. +His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._. + +Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_. +My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_. +How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_. +His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._. +I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN +I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP +Is_VBZ that_DT right_NN ?_. +Yes_UH ,_, it_PRP definitely_RB is_VBZ ._. 
+So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Straße_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._. +Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._. +Bye_NNP !_. + +On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._. +The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._. +Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._. +But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._. +Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstraße_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._. + +Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._. +I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._. +I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._. +An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN Gärtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._. +Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN Gärtnerweg_NNP 25_CD ._. +The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstraße_NNP 15_CD ._. 
+He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._. + +Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._. +He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._. +Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._. +Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._. +It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._. + +When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._. +He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._. +One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._. +So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._. +She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstraße_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._. +Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._. + +On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._. +The_DT driver_NN got_VBD badly_RB injured_VBN ._. +Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._. +A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._. 
+He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._. +He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-Löw-Straße_NNP 13_CD in_IN 84630_CD Gauting_NNP ._. +The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._. +Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._. + +Hi_NNP ,_, how_WRB are_VBP you_PRP ?_. +Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_. +Yeah_UH for_IN sure_JJ ._. +How_WRB did_VBD you_PRP know_VB that_DT ?_. +I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._. +Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._. +Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_. +Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._. +But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._. +I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstraße_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._. +The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._. +I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._. +Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_. + +My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._. +When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._. +My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstraße_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._. +But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._. +So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._. +Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._. 
+We_PRP live_VBP in_IN Bürgerstraße_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._. +I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info new file mode 100644 index 0000000..ad5fe8d --- /dev/null +++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info @@ -0,0 +1,15 @@ +# +# REQUIRED PROPERTIES +# + +# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding. +fsa.dict.separator=, + +# The charset in which the input is encoded. UTF-8 is strongly recommended. +fsa.dict.encoding=UTF-8 + +# The type of lemma-inflected form encoding compression that precedes automaton +# construction. Allowed values: [suffix, infix, prefix, none]. +# Details are in Daciuk's paper and in the code. +# Leave at 'prefix' if not sure. 
+fsa.dict.encoder=prefix \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt ---------------------------------------------------------------------- diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt new file mode 100644 index 0000000..09d39e3 --- /dev/null +++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt @@ -0,0 +1,11 @@ +casa,casa,NOUN +casar,casa,V +casar,casar,V-INF +Casa,Casa,PROP +casa,casinha,NOUN +casa,casona,NOUN +menino,menina,NOUN +menino,menino,NOUN +menino,meninão,NOUN +menino,menininho,NOUN +carro,carro,NOUN \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml deleted file mode 100644 index 56d0e47..0000000 --- a/pom.xml +++ /dev/null @@ -1,109 +0,0 @@ -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <groupId>org.apache.opennlp</groupId> - <artifactId>morfologik-addon</artifactId> - <version>1.0-SNAPSHOT</version> - <packaging>jar</packaging> - <name>Morfologik Addon</name> - - <url>http://maven.apache.org</url> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <version>2.3.2</version> - <configuration> - <source>1.7</source> - <target>1.7</target> - </configuration> - </plugin> - <plugin> - <artifactId>maven-assembly-plugin</artifactId> - <executions> - <execution> - <id>bundle-project-sources</id> - <phase>package</phase> - <goals> - <goal>single</goal> - </goals> - <configuration> - <descriptors> - 
<descriptor>src/main/assembly/bin.xml</descriptor> - <descriptor>src/main/assembly/src.xml</descriptor> - </descriptors> - <!-- Tar package is only compatible with gnu tar, - many file have more than 100 chars. - Right now only javadoc files are too long. - --> - <tarLongFileMode>gnu</tarLongFileMode> - - <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <artifactId>maven-antrun-plugin</artifactId> - <version>1.6</version> - <executions> - <execution> - <id>generate checksums for binary artifacts</id> - <goals><goal>run</goal></goals> - <phase>verify</phase> - <configuration> - <target> - <checksum algorithm="sha1" format="MD5SUM"> - <fileset dir="${project.build.directory}"> - <include name="*.zip" /> - <include name="*.gz" /> - </fileset> - </checksum> - <checksum algorithm="md5" format="MD5SUM"> - <fileset dir="${project.build.directory}"> - <include name="*.zip" /> - <include name="*.gz" /> - </fileset> - </checksum> - </target> - </configuration> - </execution> - </executions> - </plugin> - </plugins> - </build> - <properties> - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - </properties> - - <dependencies> - <dependency> - <groupId>org.carrot2</groupId> - <artifactId>morfologik-stemming</artifactId> - <version>2.1.0</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.carrot2</groupId> - <artifactId>morfologik-tools</artifactId> - <version>2.1.0</version> - <scope>compile</scope> - </dependency> - - <dependency> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-tools</artifactId> - <version>1.6.0</version> - </dependency> - - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>4.8.1</version> - <scope>test</scope> - </dependency> - - </dependencies> -</project> http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/assembly/bin.xml 
---------------------------------------------------------------------- diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml deleted file mode 100644 index ab4f6da..0000000 --- a/src/main/assembly/bin.xml +++ /dev/null @@ -1,91 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
---> - -<assembly> - <id>bin</id> - <formats> - <format>tar.gz</format> - <format>zip</format> - <format>dir</format> - </formats> - - <includeBaseDirectory>true</includeBaseDirectory> - <baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory> - - <dependencySets> - <dependencySet> - <scope>runtime</scope> - <unpack>false</unpack> - <useProjectArtifact>false</useProjectArtifact> - <fileMode>644</fileMode> - <directoryMode>755</directoryMode> - <outputDirectory>lib</outputDirectory> - <useTransitiveDependencies>true</useTransitiveDependencies> - </dependencySet> - </dependencySets> - - <fileSets> - <fileSet> - <directory>src/main/readme</directory> - <outputDirectory></outputDirectory> - <fileMode>644</fileMode> - <directoryMode>755</directoryMode> - </fileSet> - - <fileSet> - <directory>.</directory> - <outputDirectory></outputDirectory> - <filtered>true</filtered> - <fileMode>644</fileMode> - <directoryMode>755</directoryMode> - <includes> - <include>README</include> - <include>RELEASE_NOTES.html</include> - </includes> - </fileSet> - - <fileSet> - <directory>target</directory> - <outputDirectory></outputDirectory> - <fileMode>644</fileMode> - <directoryMode>755</directoryMode> - <includes> - <include>issuesFixed/**</include> - </includes> - </fileSet> - - <fileSet> - <directory>src/main/bin</directory> - <fileMode>755</fileMode> - <directoryMode>755</directoryMode> - <outputDirectory>bin</outputDirectory> - </fileSet> - - <fileSet> - <directory>target</directory> - <outputDirectory>lib</outputDirectory> - <includes> - <include>morfologik-addon-*.jar</include> - </includes> - </fileSet> - - </fileSets> -</assembly> http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/assembly/src.xml ---------------------------------------------------------------------- diff --git a/src/main/assembly/src.xml b/src/main/assembly/src.xml deleted file mode 100644 index cdcc9d3..0000000 --- a/src/main/assembly/src.xml +++ /dev/null @@ -1,39 
+0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<assembly> - <id>src</id> - <formats> - <format>tar.gz</format> - <format>zip</format> - </formats> - - <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory> - - <fileSets> - <fileSet> - <directory>../</directory> - <outputDirectory></outputDirectory> - <excludes> - <exclude>**/target/**</exclude> - <exclude>**/.*/**</exclude> - <exclude>**/pom.xml.releaseBackup</exclude> - <exclude>**/release.properties</exclude> - </excludes> - </fileSet> - </fileSets> -</assembly> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/morfologik-addon ---------------------------------------------------------------------- diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon deleted file mode 100755 index 9b0faf9..0000000 --- a/src/main/bin/morfologik-addon +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Note: Do not output anything in this script file, any output -# may be inadvertantly placed in any output files if -# output redirection is used. - -if [ -z "$JAVACMD" ] ; then - if [ -n "$JAVA_HOME" ] ; then - JAVACMD="$JAVA_HOME/bin/java" - else - JAVACMD="`which java`" - fi -fi - -# Might fail if $0 is a link -OPENNLP_HOME=`dirname "$0"`/.. - -$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@ http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/morfologik-addon.bat ---------------------------------------------------------------------- diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat deleted file mode 100644 index aeec31f..0000000 --- a/src/main/bin/morfologik-addon.bat +++ /dev/null @@ -1,47 +0,0 @@ -@ECHO off - -REM # Licensed to the Apache Software Foundation (ASF) under one -REM # or more contributor license agreements. See the NOTICE file -REM # distributed with this work for additional information -REM # regarding copyright ownership. The ASF licenses this file -REM # to you under the Apache License, Version 2.0 (the -REM # "License"); you may not use this file except in compliance -REM # with the License. 
You may obtain a copy of the License at -REM # -REM # http://www.apache.org/licenses/LICENSE-2.0 -REM # -REM # Unless required by applicable law or agreed to in writing, -REM # software distributed under the License is distributed on an -REM # # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -REM # KIND, either express or implied. See the License for the -REM # specific language governing permissions and limitations -REM # under the License. - -REM # Note: Do not output anything in this script file, any output -REM # may be inadvertantly placed in any output files if -REM # output redirection is used. -SETLOCAL - -IF "%JAVA_CMD%" == "" ( - IF "%JAVA_HOME%" == "" ( - SET JAVA_CMD=java - ) ELSE ( - REM # Keep JAVA_HOME to short-name without spaces - FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java - ) -) - -REM # Should work with Windows XP and greater. If not, specify the path to where it is installed. -IF "%OPENNLP_HOME%" == "" ( - SET OPENNLP_HOME=%~sp0.. -) ELSE ( - REM # Keep OPENNLP_HOME to short-name without spaces - FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA -) - -REM # Get the library JAR file name (JIRA OPENNLP-554) -FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A - -%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %* - -ENDLOCAL \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/opennlp-cp ---------------------------------------------------------------------- diff --git a/src/main/bin/opennlp-cp b/src/main/bin/opennlp-cp deleted file mode 100755 index dff0d12..0000000 --- a/src/main/bin/opennlp-cp +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Note: Do not output anything in this script file, any output -# may be inadvertantly placed in any output files if -# output redirection is used. - -if [ -z "$JAVACMD" ] ; then - if [ -n "$JAVA_HOME" ] ; then - JAVACMD="$JAVA_HOME/bin/java" - else - JAVACMD="`which java`" - fi -fi - -# Might fail if $0 is a link -OPENNLP_HOME=`dirname "$0"`/.. - -$JAVACMD -Xmx1024m -cp "lib/*" opennlp.tools.cmdline.CLI $@ http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java deleted file mode 100644 index dbbca4d..0000000 --- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.builder; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.file.Path; -import java.util.Properties; - -import morfologik.stemming.DictionaryMetadata; -import morfologik.stemming.EncoderType; -import morfologik.tools.DictCompile; - -/** - * Utility class to build Morfologik dictionaries from a tab separated values - * file. The first column is the word, the second its lemma and the third a POS - * tag. If there is no lemma information leave the second column empty. - */ -public class MorfologikDictionayBuilder { - - /** - * Helper to compile a morphological dictionary automaton. - * - * @param input - * The input file (base,inflected,tag). An associated metadata - * (*.info) file must exist. - * @param overwrite - * Overwrite the output file if it exists. - * @param validate - * Validate input to make sure it makes sense. - * @param acceptBom - * Accept leading BOM bytes (UTF-8). - * @param acceptCr - * Accept CR bytes in input sequences (\r). - * @param ignoreEmpty - * Ignore empty lines in the input. 
- * @return the dictionary path - * - * @throws Exception - */ - public Path build(Path input, boolean overwrite, boolean validate, - boolean acceptBom, boolean acceptCr, boolean ignoreEmpty) - throws Exception { - - DictCompile compiler = new DictCompile(input, overwrite, validate, - acceptBom, acceptCr, ignoreEmpty); - compiler.call(); - - - Path metadataPath = DictionaryMetadata - .getExpectedMetadataLocation(input); - - return metadataPath.resolveSibling( - metadataPath.getFileName().toString().replaceAll( - "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict")); - } - - /** - * Helper to compile a morphological dictionary automaton using default - * parameters. - * - * @param input - * The input file (base,inflected,tag). An associated metadata - * (*.info) file must exist. - * - * @return the dictionary path - * - * @throws Exception - */ - public Path build(Path input) throws Exception { - - return build(input, true, true, false, false, false); - - } - - Properties createProperties(Charset encoding, String separator, - EncoderType encoderType) throws FileNotFoundException, IOException { - - Properties properties = new Properties(); - properties.setProperty("fsa.dict.separator", separator); - properties.setProperty("fsa.dict.encoding", encoding.name()); - properties.setProperty("fsa.dict.encoder", encoderType.name()); - - return properties; - - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/CLI.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java deleted file mode 100644 index f92d178..0000000 --- a/src/main/java/opennlp/morfologik/cmdline/CLI.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.cmdline; - -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool; -import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool; -import opennlp.tools.cmdline.BasicCmdLineTool; -import opennlp.tools.cmdline.CmdLineTool; -import opennlp.tools.cmdline.StreamFactoryRegistry; -import opennlp.tools.cmdline.TerminateToolException; -import opennlp.tools.cmdline.TypedCmdLineTool; -import opennlp.tools.util.Version; - -public final class CLI { - - public static final String CMD = "opennlp-morfologik-addon"; - - private static Map<String, CmdLineTool> toolLookupMap; - - static { - toolLookupMap = new LinkedHashMap<String, CmdLineTool>(); - - List<CmdLineTool> tools = new LinkedList<CmdLineTool>(); - - tools.add(new MorfologikDictionaryBuilderTool()); - tools.add(new XMLDictionaryToTableTool()); - - for (CmdLineTool tool : tools) { - toolLookupMap.put(tool.getName(), tool); - } - - toolLookupMap = Collections.unmodifiableMap(toolLookupMap); - } - - /** - * @return a set which contains all tool names - */ - public static Set<String> getToolNames() { - return 
toolLookupMap.keySet(); - } - - private static void usage() { - System.out.print("OpenNLP Morfologik Addon " - + Version.currentVersion().toString() + ". "); - System.out.println("Usage: " + CMD + " TOOL"); - System.out.println("where TOOL is one of:"); - - // distance of tool name from line start - int numberOfSpaces = -1; - for (String toolName : toolLookupMap.keySet()) { - if (toolName.length() > numberOfSpaces) { - numberOfSpaces = toolName.length(); - } - } - numberOfSpaces = numberOfSpaces + 4; - - for (CmdLineTool tool : toolLookupMap.values()) { - - System.out.print(" " + tool.getName()); - - for (int i = 0; i < Math.abs(tool.getName().length() - - numberOfSpaces); i++) { - System.out.print(" "); - } - - System.out.println(tool.getShortDescription()); - } - - System.out - .println("All tools print help when invoked with help parameter"); - System.out - .println("Example: opennlp-morfologik-addon POSDictionaryBuilder help"); - } - - - @SuppressWarnings("rawtypes") - public static void main(String[] args) { - - if (args.length == 0) { - usage(); - System.exit(0); - } - - String toolArguments[] = new String[args.length -1]; - System.arraycopy(args, 1, toolArguments, 0, toolArguments.length); - - String toolName = args[0]; - - //check for format - String formatName = StreamFactoryRegistry.DEFAULT_FORMAT; - int idx = toolName.indexOf("."); - if (-1 < idx) { - formatName = toolName.substring(idx + 1); - toolName = toolName.substring(0, idx); - } - CmdLineTool tool = toolLookupMap.get(toolName); - - try { - if (null == tool) { - throw new TerminateToolException(1, "Tool " + toolName + " is not found."); - } - - if ((0 == toolArguments.length && tool.hasParams()) || - 0 < toolArguments.length && "help".equals(toolArguments[0])) { - if (tool instanceof TypedCmdLineTool) { - System.out.println(((TypedCmdLineTool) tool).getHelp(formatName)); - } else if (tool instanceof BasicCmdLineTool) { - System.out.println(tool.getHelp()); - } - - System.exit(0); - } - - if (tool 
instanceof TypedCmdLineTool) { - ((TypedCmdLineTool) tool).run(formatName, toolArguments); - } else if (tool instanceof BasicCmdLineTool) { - if (-1 == idx) { - ((BasicCmdLineTool) tool).run(toolArguments); - } else { - throw new TerminateToolException(1, "Tool " + toolName + " does not support formats."); - } - } else { - throw new TerminateToolException(1, "Tool " + toolName + " is not supported."); - } - } - catch (TerminateToolException e) { - - if (e.getMessage() != null) { - System.err.println(e.getMessage()); - } - - if (e.getCause() != null) { - System.err.println(e.getCause().getMessage()); - e.getCause().printStackTrace(System.err); - } - - System.exit(e.getCode()); - } - } - - -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java deleted file mode 100644 index 5ea2e4f..0000000 --- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.cmdline.builder; - -import java.io.File; - -import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; -import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; -import opennlp.tools.cmdline.params.EncodingParameter; - -/** - * Params for Dictionary tools. - */ -interface MorfologikDictionaryBuilderParams extends EncodingParameter { - - @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.") - File getInputFile(); - - @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).") - @OptionalParameter(defaultValue="false") - Boolean getAcceptBOM(); - - @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).") - @OptionalParameter(defaultValue="false") - Boolean getAcceptCR(); - - @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.") - @OptionalParameter(defaultValue="FSA5") - String getFormat(); - - @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.") - @OptionalParameter(defaultValue="false") - Boolean getIgnoreEmpty(); - - @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.") - @OptionalParameter(defaultValue="false") - Boolean getOverwrite(); - - @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.") - @OptionalParameter(defaultValue="false") - Boolean getValidate(); -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java deleted file mode 100644 index eb9b51c..0000000 --- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.morfologik.cmdline.builder; - -import java.io.File; -import java.nio.file.Path; - -import morfologik.stemming.DictionaryMetadata; -import opennlp.morfologik.builder.MorfologikDictionayBuilder; -import opennlp.tools.cmdline.BasicCmdLineTool; -import opennlp.tools.cmdline.CmdLineUtil; -import opennlp.tools.cmdline.TerminateToolException; - -public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool { - - interface Params extends MorfologikDictionaryBuilderParams { - } - - public String getShortDescription() { - return "builds a binary POS Dictionary using Morfologik"; - } - - public String getHelp() { - return getBasicHelp(Params.class); - } - - public void run(String[] args) { - Params params = validateAndParseParams(args, Params.class); - - File dictInFile = params.getInputFile(); - - CmdLineUtil.checkInputFile("dictionary input file", dictInFile); - Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath()); - CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile()); - - MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); - try { - builder.build(dictInFile.toPath(), params.getOverwrite(), - params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(), - params.getIgnoreEmpty()); - } catch (Exception e) { - throw new TerminateToolException(-1, - "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e); - } - - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java deleted file mode 100644 index 4ee8cd4..0000000 --- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java 
+++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.cmdline.builder; - -import java.io.File; - -import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; -import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; -import opennlp.tools.cmdline.params.EncodingParameter; - -/** - * Params for Dictionary tools. - */ -interface XMLDictionaryToTableParams extends EncodingParameter { - - @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.") - File getInputFile(); - - @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).") - File getOutputFile(); - - @ParameterDescription(valueName = "char", description = "Columm separator (must be a single character)") - @OptionalParameter(defaultValue=",") - String getSeparator(); - - @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. 
Allowed values: [suffix, infix, prefix, none].") - @OptionalParameter(defaultValue="prefix") - String getEncoder(); - -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java deleted file mode 100644 index 0e7f2d5..0000000 --- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.morfologik.cmdline.builder; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Iterator; -import java.util.Properties; - -import morfologik.stemming.DictionaryMetadata; -import opennlp.tools.cmdline.BasicCmdLineTool; -import opennlp.tools.cmdline.CmdLineUtil; -import opennlp.tools.cmdline.TerminateToolException; -import opennlp.tools.postag.POSDictionary; - -public class XMLDictionaryToTableTool extends BasicCmdLineTool { - - interface Params extends XMLDictionaryToTableParams { - } - - private String SEPARATOR; - - public String getShortDescription() { - return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file"; - } - - public String getHelp() { - return getBasicHelp(Params.class); - } - - public void run(String[] args) { - Params params = validateAndParseParams(args, Params.class); - - File dictInFile = params.getInputFile(); - File dictOutFile = params.getOutputFile(); - Charset encoding = params.getEncoding(); - SEPARATOR = params.getSeparator(); - - CmdLineUtil.checkInputFile("dictionary input file", dictInFile); - CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile); - - POSDictionary tagDictionary = null; - try { - tagDictionary = POSDictionary.create(new FileInputStream(dictInFile)); - } catch (IOException e) { - throw new TerminateToolException(-1, - "Error while loading XML POS Dictionay: " + e.getMessage(), e); - } - Iterator<String> iterator = tagDictionary.iterator(); - - try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(), - encoding)) { - while (iterator.hasNext()) { - String word = iterator.next(); - for (String tag : tagDictionary.getTags(word)) { - if(valid(word,tag)) { - String entry = createEntry(word, tag); - writer.write(entry); - writer.newLine(); - } - } - } - writer.close(); - 
System.out.println("Created dictionary: " + dictOutFile.toPath()); - } catch (IOException e) { - throw new TerminateToolException(-1, "Error while writing output: " - + e.getMessage(), e); - } - - Properties info = new Properties(); - info.setProperty("fsa.dict.separator", SEPARATOR); - info.setProperty("fsa.dict.encoding", params.getEncoding().name()); - info.setProperty("fsa.dict.encoder", params.getEncoder()); - - Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath()); - - try { - info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary."); - } catch (IOException e) { - throw new TerminateToolException(-1, "Error while writing metadata output: " - + e.getMessage(), e); - } - System.out.println("Created metadata: " + dictOutFile.toPath()); - - } - - private boolean valid(String word, String tag) { - if(word.contains(SEPARATOR) || tag.contains(SEPARATOR)) { - System.out - .println("Warn: invalid entry because contains separator - word: " - + word + " tag: " + tag); - return false; - } - - return true; - } - - private String createEntry(String word, String tag) { - - String entry = "" + SEPARATOR +// base - word + SEPARATOR + - tag; - - return entry; - } - -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java deleted file mode 100644 index 2090ce5..0000000 --- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.lemmatizer; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import morfologik.stemming.Dictionary; -import morfologik.stemming.DictionaryLookup; -import morfologik.stemming.IStemmer; -import morfologik.stemming.WordData; -import opennlp.tools.lemmatizer.DictionaryLemmatizer; - -public class MorfologikLemmatizer implements DictionaryLemmatizer { - - private IStemmer dictLookup; - public final Set<String> constantTags = new HashSet<String>(Arrays.asList( - "NNP", "NP00000")); - - public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException, - IOException { - dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath)); - } - - private HashMap<List<String>, String> getLemmaTagsDict(String word) { - List<WordData> wdList = dictLookup.lookup(word); - HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>(); - for (WordData wd : wdList) { - List<String> wordLemmaTags = new ArrayList<String>(); - wordLemmaTags.add(word); - wordLemmaTags.add(wd.getTag().toString()); - dictMap.put(wordLemmaTags, wd.getStem().toString()); - } - return dictMap; - } - - private List<String> getDictKeys(String word, String postag) { - List<String> keys = new 
ArrayList<String>(); - if (constantTags.contains(postag)) { - keys.addAll(Arrays.asList(word, postag)); - } else { - keys.addAll(Arrays.asList(word.toLowerCase(), postag)); - } - return keys; - } - - private HashMap<List<String>, String> getDictMap(String word, String postag) { - HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>(); - - if (constantTags.contains(postag)) { - dictMap = this.getLemmaTagsDict(word); - } else { - dictMap = this.getLemmaTagsDict(word.toLowerCase()); - } - return dictMap; - } - - public String lemmatize(String word, String postag) { - String lemma = null; - List<String> keys = this.getDictKeys(word, postag); - HashMap<List<String>, String> dictMap = this.getDictMap(word, postag); - // lookup lemma as value of the map - String keyValue = dictMap.get(keys); - if (keyValue != null) { - lemma = keyValue; - } else if (keyValue == null && constantTags.contains(postag)) { - lemma = word; - } else if (keyValue == null && word.toUpperCase() == word) { - lemma = word; - } else { - lemma = word.toLowerCase(); - } - return lemma; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java deleted file mode 100644 index 93d6c61..0000000 --- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.tagdict; - -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; - -import morfologik.stemming.DictionaryMetadata; -import opennlp.tools.dictionary.Dictionary; -import opennlp.tools.postag.POSTaggerFactory; -import opennlp.tools.postag.TagDictionary; -import opennlp.tools.util.InvalidFormatException; -import opennlp.tools.util.model.ArtifactSerializer; -import opennlp.tools.util.model.ModelUtil; - -public class MorfologikPOSTaggerFactory extends POSTaggerFactory { - - private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict"; - private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info"; - - private static final String MORFOLOGIK_POSDICT = "tagdict." - + MORFOLOGIK_POSDICT_SUF; - private static final String MORFOLOGIK_DICT_INFO = "tagdict." 
- + MORFOLOGIK_DICT_INFO_SUF; - - private TagDictionary dict; - - private byte[] dictInfo; - private byte[] dictData; - - public MorfologikPOSTaggerFactory() { - } - - public TagDictionary createTagDictionary(File dictionary) - throws InvalidFormatException, FileNotFoundException, IOException { - - if(!dictionary.canRead()) { - throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath()); - } - - Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath()); - - if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) { - throw new FileNotFoundException("Could not read dictionary metadata: " + dictionaryMeta.getFileName()); - } - - this.dictData = Files.readAllBytes(dictionary.toPath()); - this.dictInfo = Files.readAllBytes(dictionaryMeta); - - return createMorfologikDictionary(dictData, dictInfo); - - } - - - @Override - protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) { - super.init(ngramDictionary, null); - this.dict = posDictionary; - } - - @Override - public TagDictionary getTagDictionary() { - if (this.dict == null) { - - if (artifactProvider != null) { - Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT); - if (obj != null) { - byte[] data = (byte[]) artifactProvider - .getArtifact(MORFOLOGIK_POSDICT); - byte[] info = (byte[]) artifactProvider - .getArtifact(MORFOLOGIK_DICT_INFO); - - try { - this.dict = createMorfologikDictionary(data, info); - } catch (IllegalArgumentException e) { - throw new RuntimeException( - "Could not load the dictionary files to Morfologik.", e); - } catch (IOException e) { - throw new RuntimeException( - "IO error while reading the Morfologik dictionary files.", e); - } - } - } - } - - return this.dict; - } - - @Override - public void setTagDictionary(TagDictionary dictionary) { - this.dict = dictionary; - } - - @Override - public TagDictionary createEmptyTagDictionary() { - throw new UnsupportedOperationException( - 
"Morfologik POS Tagger factory does not support this operation"); - } - - @Override - public TagDictionary createTagDictionary(InputStream in) - throws InvalidFormatException, IOException { - throw new UnsupportedOperationException( - "Morfologik POS Tagger factory does not support this operation"); - } - - @Override - @SuppressWarnings("rawtypes") - public Map<String, ArtifactSerializer> createArtifactSerializersMap() { - Map<String, ArtifactSerializer> serializers = super - .createArtifactSerializersMap(); - - serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer()); - serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer()); - - return serializers; - } - - @Override - public Map<String, Object> createArtifactMap() { - Map<String, Object> artifactMap = super.createArtifactMap(); - artifactMap.put(MORFOLOGIK_POSDICT, this.dictData); - artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo); - return artifactMap; - } - - private TagDictionary createMorfologikDictionary(byte[] data, byte[] info) - throws IOException { - morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary - .read(new ByteArrayInputStream(data), new ByteArrayInputStream( - info)); - return new MorfologikTagDictionary(dict); - } - - static class ByteArraySerializer implements ArtifactSerializer<byte[]> { - - public byte[] create(InputStream in) throws IOException, - InvalidFormatException { - - return ModelUtil.read(in); - } - - public void serialize(byte[] artifact, OutputStream out) throws IOException { - out.write(artifact); - } - } - -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java deleted file mode 100644 index b34ca2b..0000000 --- 
a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.tagdict; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import morfologik.stemming.Dictionary; -import morfologik.stemming.DictionaryLookup; -import morfologik.stemming.IStemmer; -import morfologik.stemming.WordData; -import opennlp.tools.postag.TagDictionary; - -/** - * A POS Tagger dictionary implementation based on Morfologik binary - * dictionaries - */ -public class MorfologikTagDictionary implements TagDictionary { - - private IStemmer dictLookup; - private boolean isCaseSensitive; - - /** - * Creates a case sensitive {@link MorfologikTagDictionary} - * - * @param dict - * a Morfologik FSA dictionary - * @throws IllegalArgumentException - * if FSA's root node cannot be acquired (dictionary is empty). 
- * @throws IOException - * could not read dictionary from dictURL - */ - public MorfologikTagDictionary(Dictionary dict) - throws IllegalArgumentException, IOException { - this(dict, true); - } - - /** - * Creates MorfologikLemmatizer - * - * @param dict - * a Morfologik FSA dictionary - * @param caseSensitive - * if true it performs case sensitive lookup - * @throws IllegalArgumentException - * if FSA's root node cannot be acquired (dictionary is empty). - * @throws IOException - * could not read dictionary from dictURL - */ - public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive) - throws IllegalArgumentException, IOException { - this.dictLookup = new DictionaryLookup(dict); - this.isCaseSensitive = caseSensitive; - } - - @Override - public String[] getTags(String word) { - if (!isCaseSensitive) { - word = word.toLowerCase(); - } - - List<WordData> data = dictLookup.lookup(word); - if (data != null && data.size() > 0) { - List<String> tags = new ArrayList<String>(data.size()); - for (int i = 0; i < data.size(); i++) { - tags.add(data.get(i).getTag().toString()); - } - if (tags.size() > 0) - return tags.toArray(new String[tags.size()]); - return null; - } - return null; - } -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/util/MorfologikUtil.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java deleted file mode 100644 index bd4d1a4..0000000 --- a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.morfologik.util; - -import java.io.File; - -import morfologik.stemming.DictionaryMetadata; - -public class MorfologikUtil { - - public static File getExpectedPropertiesFile(File dictFile) { - return DictionaryMetadata.getExpectedMetadataLocation(dictFile.toPath()) - .toFile(); - } - - public static File getExpectedPropertiesFile(String dictFile) { - File f = new File(dictFile); - return getExpectedPropertiesFile(f); - } - -} http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/LICENSE ---------------------------------------------------------------------- diff --git a/src/main/readme/LICENSE b/src/main/readme/LICENSE deleted file mode 100644 index 576b4cf..0000000 --- a/src/main/readme/LICENSE +++ /dev/null @@ -1,230 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -The following license applies to the Snowball stemmers: - - Copyright (c) 2001, Dr Martin Porter - Copyright (c) 2002, Richard Boulton - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * Neither the name of the copyright holders nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/MORFOLOGIK-LICENSE ---------------------------------------------------------------------- diff --git a/src/main/readme/MORFOLOGIK-LICENSE b/src/main/readme/MORFOLOGIK-LICENSE deleted file mode 100644 index 0554010..0000000 --- a/src/main/readme/MORFOLOGIK-LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (c) 2006 Dawid Weiss -Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of Morfologik nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/NOTICE ---------------------------------------------------------------------- diff --git a/src/main/readme/NOTICE b/src/main/readme/NOTICE deleted file mode 100644 index 73fb1d7..0000000 --- a/src/main/readme/NOTICE +++ /dev/null @@ -1,11 +0,0 @@ -Apache OpenNLP -Copyright 2010, 2013 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - -The snowball stemmers in -opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball -were developed by Martin Porter and Richard Boulton. -The full snowball package is available from -http://snowball.tartarus.org/
