commit perl-Lingua-EN-Sentence for openSUSE:Factory

h_root Wed, 15 Apr 2015 07:26:19 -0700

Hello community,

here is the log from the commit of package perl-Lingua-EN-Sentence for 
openSUSE:Factory checked in at 2015-04-15 16:25:57
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence (Old)
 and      /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Package is "perl-Lingua-EN-Sentence"

Changes:
--------
--- /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence/perl-Lingua-EN-Sentence.changes  2011-11-21 12:42:07.000000000 +0100
+++ /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new/perl-Lingua-EN-Sentence.changes     2015-04-15 16:25:58.000000000 +0200
@@ -1,0 +2,18 @@
+Tue Apr 14 19:21:08 UTC 2015 - [email protected]
+
+- updated to 0.27
+   see /usr/share/doc/packages/perl-Lingua-EN-Sentence/Changes
+
+       
+  0.26 Mar 12 2015
+      - Fixed POD errors
+       - Fixed RT bug 97681, setlocale work around for Android systems
+       - Added Build.PL
+       - Added tests harness and more tests
+       - update to newer Perl idioms such as 'our' variables
+       
+  0.27 Mar 12 2015
+      - added main.t to MANIFEST
+       - added more prefixes and suffixes for people' snames, such as Mme. , Msgr.
+
+-------------------------------------------------------------------

Old:
----
  Lingua-EN-Sentence-0.25.tar.gz

New:
----
  Lingua-EN-Sentence-0.27.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ perl-Lingua-EN-Sentence.spec ++++++
--- /var/tmp/diff_new_pack.6x3Ojy/_old  2015-04-15 16:25:58.000000000 +0200
+++ /var/tmp/diff_new_pack.6x3Ojy/_new  2015-04-15 16:25:58.000000000 +0200
@@ -1,7 +1,7 @@
 #
 # spec file for package perl-Lingua-EN-Sentence
 #
-# Copyright (c) 2011 SUSE LINUX Products GmbH, Nuernberg, Germany.
+# Copyright (c) 2015 SUSE LINUX GmbH, Nuernberg, Germany.
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -16,21 +16,27 @@
 #
 
 
-
 Name:           perl-Lingua-EN-Sentence
-Version:        0.25
-Release:        3
-License:        GPL-1.0+ or Artistic-1.0
+Version:        0.27
+Release:        0
 %define cpan_name Lingua-EN-Sentence
-Summary:        Module for splitting text into sentences.
-Url:            http://search.cpan.org/dist/Lingua-EN-Sentence/
+Summary:        Module for splitting text into sentences
+License:        Artistic-1.0 or GPL-1.0+
 Group:          Development/Libraries/Perl
-#Source:         http://www.cpan.org/authors/id/S/SH/SHLOMOY/Lingua-EN-Sentence-%{version}.tar.gz
-Source:         %{cpan_name}-%{version}.tar.gz
+Url:            http://search.cpan.org/dist/Lingua-EN-Sentence/
+Source:         http://www.cpan.org/authors/id/K/KI/KIMRYAN/%{cpan_name}-%{version}.tar.gz
 BuildArch:      noarch
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:  perl
 BuildRequires:  perl-macros
+BuildRequires:  perl(Module::Build) >= 0.38
+BuildRequires:  perl(Test::More) >= 0.94
+BuildRequires:  perl(strict) >= 1.04
+BuildRequires:  perl(utf8) >= 1.09
+BuildRequires:  perl(warnings) >= 1.12
+Requires:       perl(strict) >= 1.04
+Requires:       perl(utf8) >= 1.09
+Requires:       perl(warnings) >= 1.12
 %{perl_requires}
 
 %description
@@ -59,11 +65,8 @@
 %perl_process_packlist
 %perl_gen_filelist
 
-%clean
-%{__rm} -rf %{buildroot}
-
 %files -f %{name}.files
-%defattr(644,root,root,755)
-%doc Changes
+%defattr(-,root,root,755)
+%doc Changes README
 
 %changelog

++++++ Lingua-EN-Sentence-0.25.tar.gz -> Lingua-EN-Sentence-0.27.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/Changes 
new/Lingua-EN-Sentence-0.27/Changes
--- old/Lingua-EN-Sentence-0.25/Changes 2002-09-24 12:28:55.000000000 +0200
+++ new/Lingua-EN-Sentence-0.27/Changes 2015-03-12 08:22:18.000000000 +0100
@@ -76,3 +76,14 @@
        - Changing the "rights" notice.
 0.25 Tue Sep 24 13:28:33 IDT 2002
        - changed the email address.
+       
+0.26 Mar 12 2015
+    - Fixed POD errors
+       - Fixed RT bug 97681, setlocale work around for Android systems
+       - Added Build.PL
+       - Added tests harness and more tests
+       - update to newer Perl idioms such as 'our' variables
+       
+0.27 Mar 12 2015
+    - added main.t to MANIFEST
+       - added more prefixes and suffixes for people' snames, such as Mme. , Msgr.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/INSTALL 
new/Lingua-EN-Sentence-0.27/INSTALL
--- old/Lingua-EN-Sentence-0.25/INSTALL 2002-09-24 10:35:59.000000000 +0200
+++ new/Lingua-EN-Sentence-0.27/INSTALL 1970-01-01 01:00:00.000000000 +0100
@@ -1,6 +0,0 @@
-To install the script and create man pages use the commands:
-
-  perl Makefile.PL
-  make
-  make test
-  make install          
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/MANIFEST 
new/Lingua-EN-Sentence-0.27/MANIFEST
--- old/Lingua-EN-Sentence-0.25/MANIFEST        2002-09-24 10:35:59.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.27/MANIFEST        2015-03-12 08:33:31.000000000 
+0100
@@ -1,5 +1,8 @@
 Changes
-INSTALL
+README
 MANIFEST
 Makefile.PL
 lib/Lingua/EN/Sentence.pm
+t/main.t
+META.yml
+META.json
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/META.json 
new/Lingua-EN-Sentence-0.27/META.json
--- old/Lingua-EN-Sentence-0.25/META.json       1970-01-01 01:00:00.000000000 
+0100
+++ new/Lingua-EN-Sentence-0.27/META.json       2015-03-12 08:35:06.000000000 
+0100
@@ -0,0 +1,48 @@
+{
+   "abstract" : "Split text into sentences",
+   "author" : [
+      "Shlomo Yona, Kim Ryan <kimryan at cpan org>"
+   ],
+   "dynamic_config" : 1,
+   "generated_by" : "Module::Build version 0.421",
+   "license" : [
+      "perl_5"
+   ],
+   "meta-spec" : {
+      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+      "version" : "2"
+   },
+   "name" : "Lingua-EN-Sentence",
+   "prereqs" : {
+      "build" : {
+         "requires" : {
+            "Test::More" : "0.94"
+         }
+      },
+      "configure" : {
+         "requires" : {
+            "Module::Build" : "0.38"
+         }
+      },
+      "runtime" : {
+         "requires" : {
+            "locale" : "1",
+            "strict" : "1.04",
+            "warnings" : "1.12"
+         }
+      }
+   },
+   "provides" : {
+      "Lingua::EN::Sentence" : {
+         "file" : "lib/Lingua/EN/Sentence.pm",
+         "version" : "0.27"
+      }
+   },
+   "release_status" : "stable",
+   "resources" : {
+      "license" : [
+         "http://dev.perl.org/licenses/"
+      ]
+   },
+   "version" : "0.27"
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/META.yml 
new/Lingua-EN-Sentence-0.27/META.yml
--- old/Lingua-EN-Sentence-0.25/META.yml        1970-01-01 01:00:00.000000000 
+0100
+++ new/Lingua-EN-Sentence-0.27/META.yml        2015-03-12 08:35:06.000000000 
+0100
@@ -0,0 +1,26 @@
+---
+abstract: 'Split text into sentences'
+author:
+  - 'Shlomo Yona, Kim Ryan <kimryan at cpan org>'
+build_requires:
+  Test::More: '0.94'
+configure_requires:
+  Module::Build: '0.38'
+dynamic_config: 1
+generated_by: 'Module::Build version 0.421, CPAN::Meta::Converter version 2.142060'
+license: perl
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: '1.4'
+name: Lingua-EN-Sentence
+provides:
+  Lingua::EN::Sentence:
+    file: lib/Lingua/EN/Sentence.pm
+    version: '0.27'
+requires:
+  locale: '1'
+  strict: '1.04'
+  warnings: '1.12'
+resources:
+  license: http://dev.perl.org/licenses/
+version: '0.27'
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/Makefile.PL 
new/Lingua-EN-Sentence-0.27/Makefile.PL
--- old/Lingua-EN-Sentence-0.25/Makefile.PL     2002-09-24 10:35:59.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.27/Makefile.PL     2015-03-12 00:40:18.000000000 
+0100
@@ -3,16 +3,14 @@
 # the contents of the Makefile that is written.
 WriteMakefile(
     'NAME'             => 'Lingua::EN::Sentence',
+    'ABSTRACT'  => 'Split text into sentences',
     'VERSION_FROM'     => 'lib/Lingua/EN/Sentence.pm', # finds $VERSION
-    'PREREQ_PM'                => {}, # e.g., Module::Name => 1.1
-    'LIBS'             => [''], # e.g., '-lm'
-    'DEFINE'           => '', # e.g., '-DHAVE_SOMETHING'
-    'INC'              => '', # e.g., '-I/usr/include/other'
-    'dist'             => {'COMPRESS'=>'gzip', 'SUFFIX' => 'gz'}
+    'PREREQ_PM'    =>
+    {
+        'utf8'      => 1.09,
+        'warnings'  => 1.12,
+      },    
+    'AUTHOR'        => 'Shlomo Yona, Kim Ryan',
+    'LICENSE'       => 'perl'
 );
 
-print "\n";
-print "Please take a moment to review the license.\n\n";
-print "Do a 'make' to create the Makefile\n";
-print "Do a 'make test' to test the module before installation\n";
-print "Do a 'make install' to install the module.\n"; 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/README 
new/Lingua-EN-Sentence-0.27/README
--- old/Lingua-EN-Sentence-0.25/README  1970-01-01 01:00:00.000000000 +0100
+++ new/Lingua-EN-Sentence-0.27/README  2015-03-12 08:33:04.000000000 +0100
@@ -0,0 +1,29 @@
+NAME
+
+  Lingua::EN::Sentence 
+
+DESCRIPTION
+  
+  Module for splitting text into sentences.
+
+INSTALLATION
+
+To install this module, type the following:
+
+   perl Makefile.PL
+   make
+   make test
+   make install
+   
+   or
+   
+   perl Build.PL
+   build
+   build test
+   build install
+   
+
+MAINTAINER
+
+This project was originated by Shlomo Yona. Currently  maintained
+by Kim Ryan
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/lib/Lingua/EN/Sentence.pm 
new/Lingua-EN-Sentence-0.27/lib/Lingua/EN/Sentence.pm
--- old/Lingua-EN-Sentence-0.25/lib/Lingua/EN/Sentence.pm       2002-09-24 
12:30:11.000000000 +0200
+++ new/Lingua-EN-Sentence-0.27/lib/Lingua/EN/Sentence.pm       2015-03-12 
08:33:50.000000000 +0100
@@ -23,31 +23,51 @@
 
 =head1 DESCRIPTION
 
-The C<Lingua::EN::Sentence> module contains the function get_sentences, which 
splits text into its constituent sentences, based on a regular expression and a 
list of abbreviations (built in and given).
-
-Certain well know exceptions, such as abreviations, may cause incorrect 
segmentations.  But some of them are already integrated into this code and are 
being taken care of.  Still, if you see that there are words causing the 
get_sentences() to fail, you can add those to the module, so it notices them.
+The C<Lingua::EN::Sentence> module contains the function get_sentences, which
+splits text into its constituent sentences, based on a regular expression and a
+list of abbreviations (built in and given).
+
+Certain well know exceptions, such as abreviations, may cause incorrect
+segmentations. But some of them are already integrated into this code and are
+being taken care of. Still, if you see that there are words causing the
+get_sentences() to fail, you can add those to the module, so it notices them.
 
 =head1 ALGORITHM
 
-Basically, I use a 'brute' regular expression to split the text into 
sentences.  (Well, nothing is yet split - I just mark the end-of-sentence).  
Then I look into a set of rules which decide when an end-of-sentence is 
justified and when it's a mistake. In case of a mistake, the end-of-sentence 
mark is removed. 
-
-What are such mistakes? Cases of abbreviations, for example. I have a list of 
such abbreviations (Please see `Acronym/Abbreviations list' section), and more 
general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be 
in the list as a special rule takes care of all single letter abbreviations).
+Basically, I use a 'brute' regular expression to split the text into sentences.
+(Well, nothing is yet split - I just mark the end-of-sentence). Then I look 
into
+a set of rules which decide when an end-of-sentence is justified and when it's 
a
+mistake. In case of a mistake, the end-of-sentence mark is removed.
+
+What are such mistakes? Cases of abbreviations, for example. I have a list of
+such abbreviations (Please see `Acronym/Abbreviations list' section), and more
+general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be
+in the list as a special rule takes care of all single letter abbreviations).
 
 =head1 FUNCTIONS
 
-All functions used should be requested in the 'use' clause. None is exported 
by default.
+All functions used should be requested in the 'use' clause. None is exported by
+default.
+
+=over 4
 
 =item get_sentences( $text )
 
-The get sentences function takes a scalar containing ascii text as an argument 
and returns a reference to an array of sentences that the text has been split 
into.
-Returned sentences will be trimmed (beginning and end of sentence) of 
white-spaces.
-Strings with no alpha-numeric characters in them, won't be returned as 
sentences.
+The get_sentences function takes a scalar containing ascii text as an argument
+and returns a reference to an array of sentences that the text has been split
+into. Returned sentences will be trimmed (beginning and end of sentence) of
+white space. Strings with no alpha-numeric characters in them, won't be
+returned as sentences.
 
 =item add_acronyms( @acronyms )
 
-This function is used for adding acronyms not supported by this code.  Please 
see `Acronym/Abbreviations list' section for the abbreviations already 
supported by this module.
+This function is used for adding acronyms not supported by this code.Note that
+acronyms are seaerched for on a case insensive basis.
 
-=item get_acronyms(    )
+Please see`Acronym/Abbreviations list' section for the abbreviations already
+supported by this module.
+
+=item get_acronyms( )
 
 This function will return the defined list of acronyms.
 
@@ -55,16 +75,19 @@
 
 This function replaces the predefined acroym list with the given list.
 
-=item get_EOS( )
+=item get_EOS( )
 
-This function returns the value of the string used to mark the end of 
sentence. You might want to see what it is, and to make sure your text doesn't 
contain it. You can use set_EOS() to alter the end-of-sentence string to 
whatever you desire.
+This function returns the value of the string used to mark the end of sentence.
+You might want to see what it is, and to make sure your text doesn't contain 
it.
+You can use set_EOS() to alter the end-of-sentence string to whatever you
+desire.
 
 =item set_EOS( $new_EOS_string )
 
 This function alters the end-of-sentence string used to mark the end of 
sentences. 
 
 =item set_locale( $new_locale )
-Revceives language locale in the form language.country.character-set
+Receives language locale in the form language.country.character-set
 for example:
        "fr_CA.ISO8859-1"
 for Canadian French using character set ISO8859-1.
@@ -73,7 +96,9 @@
 Returns undef if got undef.
 
 
-The following will set the LC_COLLATE behaviour to Argentinian Spanish. NOTE: 
The naming and avail� ability of locales depends on your operating sys� tem. 
Please consult the perllocale manpage for how to find out which locales are 
available in your system.
+The following will set the LC_COLLATE behaviour to Argentinian Spanish.
+NOTE: The naming and availability of locales depends on your operating sysem.
+Please consult the perllocale manpage for how to find out which locales are 
available in your system.
 
 $loc = set_locale( "es_AR.ISO8859-1" );
 
@@ -81,6 +106,8 @@
 
 $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
 
+=back
+
 =head1 Acronym/Abbreviations list
 
 You can use the get_acronyms() function to get acronyms.
@@ -90,10 +117,11 @@
 Feel free to suggest such lists. 
 
 =head1 FUTURE WORK
-[1] Object Oriented like usage
-[2] Supporting more than just English/French
-[3] Code optimization. Currently everything is RE based and not so optimized RE
-[4] Possibly use more semantic heuristics for detecting a beginning of a 
sentence
+
+       [1] Object Oriented like usage
+       [2] Supporting more than just English/French
+       [3] Code optimization. Currently everything is RE based and not so 
optimized RE
+       [4] Possibly use more semantic heuristics for detecting a beginning of 
a sentence
 
 =head1 SEE ALSO
 
@@ -103,9 +131,11 @@
 
 Shlomo Yona [email protected]
 
+Currently being maintained by Kim Ryan, kimryan at CPAN d o t org
+
 =head1 COPYRIGHT
 
-Copyright (c) 2001, 2002 Shlomo Yona. All rights reserved.
+Copyright (c) 2001-2015 Shlomo Yona. All rights reserved.
 
 This library is free software. 
 You can redistribute it and/or modify it under the same terms as Perl itself.  
@@ -124,15 +154,16 @@
 # Pragmas
 #
 #==============================================================================
-require 5.005_03;
+
 use strict;
-use POSIX qw(locale_h);
+use warnings;
+use POSIX qw(locale_h setlocale);
 #==============================================================================
 #
 # Modules
 #
 #==============================================================================
-require Exporter;
+use Exporter;
 
 #==============================================================================
 #
@@ -141,44 +172,53 @@
 #==============================================================================
 use vars qw/$VERSION @ISA @EXPORT_OK $EOS $LOC $AP $P $PAP @ABBREVIATIONS/;
 use Carp qw/cluck/;
+use English;
 
-$VERSION = '0.25';
+our $VERSION = '0.27';
 
 # LC_CTYPE now in locale "French, Canada, codeset ISO 8859-1"
-$LOC=setlocale(LC_CTYPE, "fr_CA.ISO8859-1"); 
+our $LOC;
+if ($OSNAME ne 'android') {
+       # Call POSIX function
+       $LOC=  setlocale(LC_CTYPE, "fr_CA.ISO8859-1");
+}
+
+ 
 use locale;
 
 @ISA = qw( Exporter );
-@EXPORT_OK = qw( get_sentences 
-               add_acronyms get_acronyms set_acronyms
-               get_EOS set_EOS);
-
-$EOS="\001";
-$P = q/[\.!?]/;                        ## PUNCTUATION
-$AP = q/(?:'|"|�|\)|\]|\})?/;  ## AFTER PUNCTUATION
-$PAP = $P.$AP;
-
-my @PEOPLE = ( 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', "sens?", "reps?", 
'gov',
-               "attys?", 'supt',  'det', 'rev' );
-
-
-my @ARMY = ( 'col','gen', 'lt', 'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' );
-my @INSTITUTES = ( 'dept', 'univ', 'assn', 'bros' );
-my @COMPANIES = ( 'inc', 'ltd', 'co', 'corp' );
-my @PLACES = ( 'arc', 'al', 'ave', "blv?d", 'cl', 'ct', 'cres', 'dr', "expy?",
-               'dist', 'mt', 'ft',
-               "fw?y", "hwa?y", 'la', "pde?", 'pl', 'plz', 'rd', 'st', 'tce',
-               'Ala' , 'Ariz', 'Ark', 'Cal', 'Calif', 'Col', 'Colo', 'Conn',
-               'Del', 'Fed' , 'Fla', 'Ga', 'Ida', 'Id', 'Ill', 'Ind', 'Ia',
-               'Kan', 'Kans', 'Ken', 'Ky' , 'La', 'Me', 'Md', 'Is', 'Mass', 
-               'Mich', 'Minn', 'Miss', 'Mo', 'Mont', 'Neb', 'Nebr' , 'Nev',
-               'Mex', 'Okla', 'Ok', 'Ore', 'Penna', 'Penn', 'Pa'  , 'Dak',
-               'Tenn', 'Tex', 'Ut', 'Vt', 'Va', 'Wash', 'Wis', 'Wisc', 'Wy',
-               'Wyo', 'USAFA', 'Alta' , 'Man', 'Ont', 'Qu�', 'Sask', 'Yuk');
-my @MONTHS = 
('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec','sept');
-my @MISC = ( 'vs', 'etc', 'no', 'esp' );
+@EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS set_EOS set_locale);             
+
+our $EOS="\001";
 
-@ABBREVIATIONS = (@PEOPLE, @ARMY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, 
@MISC ); 
+our $P = q/[\.!?]/;                    ## PUNCTUATION
+#$AP = q/(?:'|"|?|\)|\]|\})?/; ## AFTER PUNCTUATION Kim change
+$AP =  q/(?:'|"|\?|\)|\]|\})?/;        ## AFTER PUNCTUATION
+our $PAP = $P.$AP;
+
+my @PEOPLE = qw( mr mrs ms dr prof mme ms?gr sens? reps? gov attys? supt  det revd? ald);
+my @TITLE_SUFFIXES = qw(PhD jn?r sn?r esq md llb);
+my @ARMY = qw( col gen lt cmdr adm capt sgt cpl maj pte cdr);
+my @INSTITUTES = qw( dept univ assn bros);
+my @COMPANIES = qw( inc ltd co corp);
+my @PLACES =
+qw(
+       arc al ave blv?d cl ct cres dr expy? 
+       dist mt ft 
+       fw?y hwa?y la pde? pl plz rd st tce 
+       Ala  Ariz Ark Cal Calif Col Colo Conn 
+       Del Fed  Fla Ga Ida Id Ill Ind Ia 
+       Kan Kans Ken Ky  La Me Md Is Mass 
+       Mich Minn Miss Mo Mont Neb Nebr  Nev 
+       Mex Okla Ok Ore Penna Penn Pa  Dak 
+       Tenn Tex Ut Vt Va Wash Wis Wisc Wy 
+       Wyo USAFA Alta  Man Ont Qu? Sask Yuk
+);
+my @MONTHS = qw(jan feb mar apr may jun jul aug sep oct nov dec sept);
+my @MISC = qw(no esp);
+my @LATIN = qw(vs etc al ibid sic);
+
+our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @ARMY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC, @LATIN ); 
 
 
 #==============================================================================
@@ -189,9 +229,9 @@
 
 #------------------------------------------------------------------------------
 # get_sentences - takes text input and splits it into sentences.
-# A regular expression cuts viciously the text into sentences, 
+# A regular expression viciously cuts the text into sentences, 
 # and then a list of rules (some of them consist of a list of abbreviations)
-# is applied on the marked text in order to fix end-of-sentence markings on 
+# are applied on the marked text in order to fix end-of-sentence markings in 
 # places which are not indeed end-of-sentence.
 #------------------------------------------------------------------------------
 sub get_sentences {
@@ -213,28 +253,28 @@
 }
 
 #------------------------------------------------------------------------------
-# get_acronyms - get defined list of acronyms.
+# get_acronyms - get list of defined acronyms.
 #------------------------------------------------------------------------------
 sub get_acronyms {
        return @ABBREVIATIONS;
 }
 
 #------------------------------------------------------------------------------
-# set_acronyms - run over the predefined acronyms list with your own list.
+# set_acronyms - replace the predefined acronyms list with your own list.
 #------------------------------------------------------------------------------
 sub set_acronyms {
        @ABBREVIATIONS=@_;
 }
 
 #------------------------------------------------------------------------------
-# get_EOS - get the value of the $EOS (end-of-sentence mark).
+# get_EOS - get the value of the $EOS variable (end-of-sentence mark).
 #------------------------------------------------------------------------------
 sub get_EOS {
        return $EOS;
 }
 
 #------------------------------------------------------------------------------
-# set_EOS - set the value of the $EOS (end-of-sentence mark).
+# set_EOS - set the value of the $EOS variable (end-of-sentence mark).
 #------------------------------------------------------------------------------
 sub set_EOS {
        my ($new_EOS) = @_;
@@ -246,32 +286,29 @@
 }
 
 #------------------------------------------------------------------------------
+
 # set_locale - set the value of the locale.
 #
-#              Revceives language locale in the form
-#                      language.country.character-set
-#              for example:
-#                              "fr_CA.ISO8859-1"
-#              for Canadian French using character set ISO8859-1.
+# Receieves language locale in the form
+#      language.country.character-set
+# for example:
+#      "fr_CA.ISO8859-1"
+# for Canadian French using character set ISO8859-1.
 #
-#              Returns a reference to a hash containing the current locale 
-#              formatting values.
-#              Returns undef if got undef.
+# Returns a reference to a hash containing the current locale formatting 
values.
+# Returns undef if got undef.
 #
+#      The following will set the LC_ALL behaviour to Argentinian Spanish.
+#      NOTE: The naming and availability of locales depends on your operating 
system.
+#      Please consult the perllocale manpage for how to find out which locales 
are
+#      available in your system.
 #
-#               The following will set the LC_COLLATE behaviour to
-#               Argentinian Spanish. NOTE: The naming and avail�
-#               ability of locales depends on your operating sys�
-#               tem. Please consult the perllocale manpage for how
-#               to find out which locales are available in your
-#               system.
+#              $loc = set_locale( "es_AR.ISO8859-1" );
 #
-#                       $loc = set_locale( "es_AR.ISO8859-1" );
+# This actually does this:
 #
-#
-#              This actually does this:
-#
-#                      $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
+#      $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" ); # NOTE, but actually does LC_CTYPE, should be LC_COLLATE?
+
 #------------------------------------------------------------------------------
 sub set_locale {
        my ($new_locale) = @_;
@@ -279,8 +316,15 @@
                cluck "Won't set locale to undefined value!\n";
                return undef;
        }
-       $LOC = setlocale(LC_CTYPE, $new_locale); 
-       return $LOC;
+       
+       if ($OSNAME ne 'android') {
+               # Call POSIX function
+               $LOC = setlocale(LC_CTYPE, $new_locale);
+               return $LOC;
+       }
+       else {
+               return undef;   
+       }       
 }
 
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/lib/Lingua/EN/Sentence.pm~ 
new/Lingua-EN-Sentence-0.27/lib/Lingua/EN/Sentence.pm~
--- old/Lingua-EN-Sentence-0.25/lib/Lingua/EN/Sentence.pm~      2002-09-24 
12:30:02.000000000 +0200
+++ new/Lingua-EN-Sentence-0.27/lib/Lingua/EN/Sentence.pm~      1970-01-01 
01:00:00.000000000 +0100
@@ -1,378 +0,0 @@
-package Lingua::EN::Sentence;
-
-#==============================================================================
-#
-# Start of POD
-#
-#==============================================================================
-
-=head1 NAME
-
-Lingua::EN::Sentence - Module for splitting text into sentences.
-
-=head1 SYNOPSIS
-
-       use Lingua::EN::Sentence qw( get_sentences add_acronyms );
-
-       add_acronyms('lt','gen');               ## adding support for 'Lt. Gen.'
-       my $sentences=get_sentences($text);     ## Get the sentences.
-       foreach my $sentence (@$sentences) {
-               ## do something with $sentence
-       }
-
-
-=head1 DESCRIPTION
-
-The C<Lingua::EN::Sentence> module contains the function get_sentences, which 
splits text into its constituent sentences, based on a regular expression and a 
list of abbreviations (built in and given).
-
-Certain well know exceptions, such as abreviations, may cause incorrect 
segmentations.  But some of them are already integrated into this code and are 
being taken care of.  Still, if you see that there are words causing the 
get_sentences() to fail, you can add those to the module, so it notices them.
-
-=head1 ALGORITHM
-
-Basically, I use a 'brute' regular expression to split the text into 
sentences.  (Well, nothing is yet split - I just mark the end-of-sentence).  
Then I look into a set of rules which decide when an end-of-sentence is 
justified and when it's a mistake. In case of a mistake, the end-of-sentence 
mark is removed. 
-
-What are such mistakes? Cases of abbreviations, for example. I have a list of 
such abbreviations (Please see `Acronym/Abbreviations list' section), and more 
general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be 
in the list as a special rule takes care of all single letter abbreviations).
-
-=head1 FUNCTIONS
-
-All functions used should be requested in the 'use' clause. None is exported 
by default.
-
-=item get_sentences( $text )
-
-The get sentences function takes a scalar containing ascii text as an argument 
and returns a reference to an array of sentences that the text has been split 
into.
-Returned sentences will be trimmed (beginning and end of sentence) of 
white-spaces.
-Strings with no alpha-numeric characters in them, won't be returned as 
sentences.
-
-=item add_acronyms( @acronyms )
-
-This function is used for adding acronyms not supported by this code.  Please 
see `Acronym/Abbreviations list' section for the abbreviations already 
supported by this module.
-
-=item get_acronyms(    )
-
-This function will return the defined list of acronyms.
-
-=item set_acronyms( @my_acronyms )
-
-This function replaces the predefined acroym list with the given list.
-
-=item get_EOS( )
-
-This function returns the value of the string used to mark the end of 
sentence. You might want to see what it is, and to make sure your text doesn't 
contain it. You can use set_EOS() to alter the end-of-sentence string to 
whatever you desire.
-
-=item set_EOS( $new_EOS_string )
-
-This function alters the end-of-sentence string used to mark the end of 
sentences. 
-
-=item set_locale( $new_locale )
-Revceives language locale in the form language.country.character-set
-for example:
-       "fr_CA.ISO8859-1"
-for Canadian French using character set ISO8859-1.
-
-Returns a reference to a hash containing the current locale formatting values.
-Returns undef if got undef.
-
-
-The following will set the LC_COLLATE behaviour to Argentinian Spanish. NOTE: 
The naming and avail� ability of locales depends on your operating sys� tem. 
Please consult the perllocale manpage for how to find out which locales are 
available in your system.
-
-$loc = set_locale( "es_AR.ISO8859-1" );
-
-This actually does this:
-
-$loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
-
-=head1 Acronym/Abbreviations list
-
-You can use the get_acronyms() function to get acronyms.
-It has become too long to specify in the documentation.
-
-If I come across a good general-purpose list - I'll incorporate it into this 
module.
-Feel free to suggest such lists. 
-
-=head1 FUTURE WORK
-[1] Object Oriented like usage
-[2] Supporting more than just English/French
-[3] Code optimization. Currently everything is RE based and not so optimized RE
-[4] Possibly use more semantic heuristics for detecting a beginning of a 
sentence
-
-=head1 SEE ALSO
-
-       Text::Sentence
-
-=head1 AUTHOR
-
-Shlomo Yona [email protected]
-
-=head1 COPYRIGHT
-
-Copyright (c) 2001, 2002 Shlomo Yona. All rights reserved.
-
-This library is free software. 
-You can redistribute it and/or modify it under the same terms as Perl itself.  
-
-=cut
-
-#==============================================================================
-#
-# End of POD
-#
-#==============================================================================
-
-
-#==============================================================================
-#
-# Pragmas
-#
-#==============================================================================
-require 5.005_03;
-use strict;
-use POSIX qw(locale_h);
-#==============================================================================
-#
-# Modules
-#
-#==============================================================================
-require Exporter;
-
-#==============================================================================
-#
-# Public globals
-#
-#==============================================================================
-use vars qw/$VERSION @ISA @EXPORT_OK $EOS $LOC $AP $P $PAP @ABBREVIATIONS/;
-use Carp qw/cluck/;
-
-$VERSION = '0.25';
-
-# LC_CTYPE now in locale "French, Canada, codeset ISO 8859-1"
-$LOC=setlocale(LC_CTYPE, "fr_CA.ISO8859-1"); 
-use locale;
-
-@ISA = qw( Exporter );
-@EXPORT_OK = qw( get_sentences 
-               add_acronyms get_acronyms set_acronyms
-               get_EOS set_EOS);
-
-$EOS="\001";
-$P = q/[\.!?]/;                        ## PUNCTUATION
-$AP = q/(?:'|"|�|\)|\]|\})?/;  ## AFTER PUNCTUATION
-$PAP = $P.$AP;
-
-my @PEOPLE = ( 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', "sens?", "reps?", 
'gov',
-               "attys?", 'supt',  'det', 'rev' );
-
-
-my @ARMY = ( 'col','gen', 'lt', 'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' );
-my @INSTITUTES = ( 'dept', 'univ', 'assn', 'bros' );
-my @COMPANIES = ( 'inc', 'ltd', 'co', 'corp' );
-my @PLACES = ( 'arc', 'al', 'ave', "blv?d", 'cl', 'ct', 'cres', 'dr', "expy?",
-               'dist', 'mt', 'ft',
-               "fw?y", "hwa?y", 'la', "pde?", 'pl', 'plz', 'rd', 'st', 'tce',
-               'Ala' , 'Ariz', 'Ark', 'Cal', 'Calif', 'Col', 'Colo', 'Conn',
-               'Del', 'Fed' , 'Fla', 'Ga', 'Ida', 'Id', 'Ill', 'Ind', 'Ia',
-               'Kan', 'Kans', 'Ken', 'Ky' , 'La', 'Me', 'Md', 'Is', 'Mass', 
-               'Mich', 'Minn', 'Miss', 'Mo', 'Mont', 'Neb', 'Nebr' , 'Nev',
-               'Mex', 'Okla', 'Ok', 'Ore', 'Penna', 'Penn', 'Pa'  , 'Dak',
-               'Tenn', 'Tex', 'Ut', 'Vt', 'Va', 'Wash', 'Wis', 'Wisc', 'Wy',
-               'Wyo', 'USAFA', 'Alta' , 'Man', 'Ont', 'Qu�', 'Sask', 'Yuk');
-my @MONTHS = 
('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec','sept');
-my @MISC = ( 'vs', 'etc', 'no', 'esp' );
-
-@ABBREVIATIONS = (@PEOPLE, @ARMY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC );
-
-
-#==============================================================================
-#
-# Public methods
-#
-#==============================================================================
-
-#------------------------------------------------------------------------------
-# get_sentences - takes text input and splits it into sentences.
-# A regular expression cuts viciously the text into sentences, 
-# and then a list of rules (some of them consist of a list of abbreviations)
-# is applied on the marked text in order to fix end-of-sentence markings on 
-# places which are not indeed end-of-sentence.
-#------------------------------------------------------------------------------
-sub get_sentences {
-       my ($text)=@_;
-       return [] unless defined $text;
-       my $marked_text = first_sentence_breaking($text);
-       my $fixed_marked_text = remove_false_end_of_sentence($marked_text);
-       $fixed_marked_text = split_unsplit_stuff($fixed_marked_text);
-       my @sentences = split(/$EOS/,$fixed_marked_text);
-       my $cleaned_sentences = clean_sentences(\@sentences);
-       return $cleaned_sentences;
-}
-
-#------------------------------------------------------------------------------
-# add_acronyms - user can add a list of acronyms/abbreviations.
-#------------------------------------------------------------------------------
-sub add_acronyms {
-       push @ABBREVIATIONS, @_;
-}
-
-#------------------------------------------------------------------------------
-# get_acronyms - get defined list of acronyms.
-#------------------------------------------------------------------------------
-sub get_acronyms {
-       return @ABBREVIATIONS;
-}
-
-#------------------------------------------------------------------------------
-# set_acronyms - run over the predefined acronyms list with your own list.
-#------------------------------------------------------------------------------
-sub set_acronyms {
-       @ABBREVIATIONS=@_;
-}
-
-#------------------------------------------------------------------------------
-# get_EOS - get the value of the $EOS (end-of-sentence mark).
-#------------------------------------------------------------------------------
-sub get_EOS {
-       return $EOS;
-}
-
-#------------------------------------------------------------------------------
-# set_EOS - set the value of the $EOS (end-of-sentence mark).
-#------------------------------------------------------------------------------
-sub set_EOS {
-       my ($new_EOS) = @_;
-       if (not defined $new_EOS) {
-               cluck "Won't set \$EOS to undefined value!\n";
-               return $EOS;
-       }
-       return $EOS = $new_EOS;
-}
-
-#------------------------------------------------------------------------------
-# set_locale - set the value of the locale.
-#
-#              Revceives language locale in the form
-#                      language.country.character-set
-#              for example:
-#                              "fr_CA.ISO8859-1"
-#              for Canadian French using character set ISO8859-1.
-#
-#              Returns a reference to a hash containing the current locale 
-#              formatting values.
-#              Returns undef if got undef.
-#
-#
-#               The following will set the LC_COLLATE behaviour to
-#               Argentinian Spanish. NOTE: The naming and avail-
-#               ability of locales depends on your operating sys-
-#               tem. Please consult the perllocale manpage for how
-#               to find out which locales are available in your
-#               system.
-#
-#                       $loc = set_locale( "es_AR.ISO8859-1" );
-#
-#
-#              This actually does this:
-#
-#                      $loc = setlocale( LC_ALL, "es_AR.ISO8859-1" );
-#------------------------------------------------------------------------------
-sub set_locale {
-       my ($new_locale) = @_;
-       if (not defined $new_locale) {
-               cluck "Won't set locale to undefined value!\n";
-               return undef;
-       }
-       $LOC = setlocale(LC_CTYPE, $new_locale); 
-       return $LOC;
-}
-
-
-#==============================================================================
-#
-# Private methods
-#
-#==============================================================================
-
-## Please email me any suggestions for optimizing these RegExps.
-sub remove_false_end_of_sentence {
-       my ($marked_segment) = @_;
-##     ## don't do u.s.a.
-##     $marked_segment=~s/(\.\w$PAP)$EOS/$1/sg; 
-       $marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg;
-       $marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg;         
-
-       # don't plit after a white-space followed by a single letter followed
-       # by a dot followed by another whitespace.
-       $marked_segment=~s/(\s\w\.\s+)$EOS/$1/sg; 
-
-       # fix: bla bla... yada yada
-       $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg; 
-       # fix "." "?" "!"
-       $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
-       ## fix where abbreviations exist
-       foreach (@ABBREVIATIONS) { $marked_segment=~s/(\b$_$PAP\s)$EOS/$1/isg; }
-       
-       # don't break after quote unless its a capital letter.
-       $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
-
-       # don't break: text . . some more text.
-       $marked_segment=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
-
-       $marked_segment=~s/(\s$PAP\s)$EOS/$1/sg;
-       return $marked_segment;
-}
-
-sub split_unsplit_stuff {
-       my ($text) = @_;
-
-       $text=~s/(\D\d+)($P)(\s+)/$1$2$EOS$3/sg;
-       $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs;
-       $text=~s/('\w$P)(\s)/$1$EOS$2/gs;
-
-
-       $text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis;
-
-##     # split where single capital letter followed by dot makes sense to break.
-##     # notice these are exceptions to the general rule NOT to split on single
-##     # letter.
-##     # notice also that sibgle letter M is missing here, due to French 'mister'
-##     # which is representes as M.
-##     #
-##     # the rule will not split on names begining or containing 
-##     # single capital letter dot in the first or second name
-##     # assuming 2 or three word name.
-##     $text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg;
-
-
-       # add EOS when you see "a.m." or "p.m." followed by a capital letter.
-       $text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs;
-
-       return $text;
-}
-
-sub clean_sentences {
-       my ($sentences) = @_;
-               my $cleaned_sentences;
-               foreach my $s (@$sentences) {
-                       next if not defined $s;
-                       next if $s!~m/\w+/;
-                       $s=~s/^\s*//;
-                       $s=~s/\s*$//;
-##                     $s=~s/\s+/ /g;
-                       push @$cleaned_sentences,$s;
-               }
-       return $cleaned_sentences;
-}
-
-sub first_sentence_breaking {
-       my ($text) = @_;
-       $text=~s/\n\s*\n/$EOS/gs;       ## double new-line means a different sentence.
-       $text=~s/($PAP\s)/$1$EOS/gs;
-       $text=~s/(\s\w$P)/$1$EOS/gs; # breake also when single letter comes before punc.
-       return $text;
-}
-
-#==============================================================================
-#
-# Return TRUE
-#
-#==============================================================================
-
-1;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/t/main.t new/Lingua-EN-Sentence-0.27/t/main.t
--- old/Lingua-EN-Sentence-0.25/t/main.t        1970-01-01 01:00:00.000000000 +0100
+++ new/Lingua-EN-Sentence-0.27/t/main.t        2015-03-12 01:04:16.000000000 +0100
@@ -0,0 +1,35 @@
+#------------------------------------------------------------------------------
+# Test script for Lingua::EN::::Sentence.pm
+#
+# Author      : Kim Ryan, 
+# Last update : 2015-03-10
+#------------------------------------------------------------------------------
+
+use warnings;
+use strict;
+use Test::More tests => 3;
+
+BEGIN {
+
+  # does it load properly?
+  require_ok('Lingua::EN::Sentence');
+}
+use Lingua::EN::Sentence qw( get_sentences add_acronyms get_acronyms);
+
+my $par = q{
+Returns the number of sentences in string. A sentence ends with a dot
+exclamtion or question mark followed by a space! Dots in abbreviations such as e.g. are ignored, as well as common
+abbreviations such as Dr. Ms. esp. Apr. Calif. and Ave. insitals such as 'Mr. A. Smith' plus more?
+Dots in muti word abrreviatons such as U.S.A are also ignored.
+This string has 4 sentences.
+};
+
+my $sentences=get_sentences($par);     
+is( @$sentences, 5,'sub sentence_count');
+
+$par .= 'Now add an acronym, such as qld. for Queensland to the paragraph.';
+add_acronyms('Qld');
+$sentences=get_sentences($par);
+is( @$sentences, 6,'sub add_acronyms');
+
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.25/t/test_nothing.t new/Lingua-EN-Sentence-0.27/t/test_nothing.t
--- old/Lingua-EN-Sentence-0.25/t/test_nothing.t        2002-09-24 10:35:59.000000000 +0200
+++ new/Lingua-EN-Sentence-0.27/t/test_nothing.t        1970-01-01 01:00:00.000000000 +0100
@@ -1,7 +0,0 @@
-
-BEGIN { $| = 1; print "1..1\n"; }
-use Lingua::EN::Sentence (get_sentences);
-$loaded = 1;
-print "ok 1\n";
-END {print "not ok 1\n" unless $loaded;}
-

commit perl-Lingua-EN-Sentence for openSUSE:Factory

Reply via email to