commit perl-Lingua-EN-Sentence for openSUSE:Factory

Source-Sync Wed, 13 Jul 2022 06:35:20 -0700

Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package perl-Lingua-EN-Sentence for 
openSUSE:Factory checked in at 2022-07-13 15:35:04
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence (Old)
 and      /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.1523 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Package is "perl-Lingua-EN-Sentence"

Wed Jul 13 15:35:04 2022 rev:16 rq:988972 version:0.33

Changes:
--------
--- 
/work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence/perl-Lingua-EN-Sentence.changes
  2018-08-31 10:43:51.167160167 +0200
+++ 
/work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.1523/perl-Lingua-EN-Sentence.changes
        2022-07-13 15:35:05.333377829 +0200
@@ -1,0 +2,14 @@
+Tue Jul  5 03:06:22 UTC 2022 - Tina Müller <timueller+p...@suse.de>
+
+- updated to 0.33
+   see /usr/share/doc/packages/perl-Lingua-EN-Sentence/Changes
+
+  0.32 July 2022
+       fixed bug causing abbreviation followed by '(' to break sentnece, 
reported in github
+       dot following an abbreviation now explicitly marked up
+       added more acronyms
+       improved documentation
+       improved tests
+       added verbose moe for debugging
+
+-------------------------------------------------------------------

Old:
----
  Lingua-EN-Sentence-0.31.tar.gz

New:
----
  Lingua-EN-Sentence-0.33.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ perl-Lingua-EN-Sentence.spec ++++++
--- /var/tmp/diff_new_pack.RWUygO/_old  2022-07-13 15:35:05.885378679 +0200
+++ /var/tmp/diff_new_pack.RWUygO/_new  2022-07-13 15:35:05.889378686 +0200
@@ -1,7 +1,7 @@
 #
 # spec file for package perl-Lingua-EN-Sentence
 #
-# Copyright (c) 2018 SUSE LINUX GmbH, Nuernberg, Germany.
+# Copyright (c) 2022 SUSE LLC
 #
 # All modifications and additions to the file contributed by third parties
 # remain the property of their copyright owners, unless otherwise agreed
@@ -12,28 +12,26 @@
 # license that conforms to the Open Source Definition (Version 1.9)
 # published by the Open Source Initiative.
 
-# Please submit bugfixes or comments via http://bugs.opensuse.org/
+# Please submit bugfixes or comments via https://bugs.opensuse.org/
 #
 
 
+%define cpan_name Lingua-EN-Sentence
 Name:           perl-Lingua-EN-Sentence
-Version:        0.31
+Version:        0.33
 Release:        0
-%define cpan_name Lingua-EN-Sentence
-Summary:        Split Text Into Sentences
 License:        Artistic-1.0 OR GPL-1.0-or-later
-Group:          Development/Libraries/Perl
-Url:            http://search.cpan.org/dist/Lingua-EN-Sentence/
+Summary:        Split text into sentences
+URL:            https://metacpan.org/release/%{cpan_name}
 Source0:        
https://cpan.metacpan.org/authors/id/K/KI/KIMRYAN/%{cpan_name}-%{version}.tar.gz
 Source1:        cpanspec.yml
 BuildArch:      noarch
-BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:  perl
 BuildRequires:  perl-macros
 BuildRequires:  perl(Module::Build) >= 0.380000
 BuildRequires:  perl(Test::More) >= 0.94
-BuildRequires:  perl(warnings) >= 1.12
-Requires:       perl(warnings) >= 1.12
+BuildRequires:  perl(warnings) >= 1.06
+Requires:       perl(warnings) >= 1.06
 %{perl_requires}
 
 %description
@@ -45,25 +43,25 @@
 segmentations. But some of them are already integrated into this code and
 are being taken care of. Still, if you see that there are words causing the
 get_sentences function to fail, you can add those to the module, so it
-notices them.
+notices them. Note that abbreviations are case sensitive, so 'Mrs.' is
+recognised but not 'mrs.'
 
 %prep
-%setup -q -n %{cpan_name}-%{version}
+%autosetup  -n %{cpan_name}-%{version}
 
 %build
-%{__perl} Makefile.PL INSTALLDIRS=vendor
-%{__make} %{?_smp_mflags}
+perl Build.PL installdirs=vendor
+./Build build flags=%{?_smp_mflags}
 
 %check
-%{__make} test
+./Build test
 
 %install
-%perl_make_install
-%perl_process_packlist
+./Build install destdir=%{buildroot} create_packlist=0
 %perl_gen_filelist
 
 %files -f %{name}.files
-%defattr(-,root,root,755)
-%doc Changes examples LICENCE README
+%doc Changes examples README
+%license LICENCE
 
 %changelog

++++++ Lingua-EN-Sentence-0.31.tar.gz -> Lingua-EN-Sentence-0.33.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/Build.PL 
new/Lingua-EN-Sentence-0.33/Build.PL
--- old/Lingua-EN-Sentence-0.31/Build.PL        1970-01-01 01:00:00.000000000 
+0100
+++ new/Lingua-EN-Sentence-0.33/Build.PL        2022-07-05 03:36:50.000000000 
+0200
@@ -0,0 +1,29 @@
+use Module::Build;
+
+Module::Build -> new
+(
+ module_name    => 'Lingua::EN::Sentence',
+ license        => 'perl',
+ dist_abstract  => "Split text into sentences",
+ dist_author    => 'Shlomo Yona, Kim Ryan <kimryan at cpan org>',
+ build_requires =>
+ {
+       Test::More => 0.94,
+ },
+ configure_requires =>
+ {
+  Module::Build => 0.3800,
+ },
+ requires =>
+ {
+  'perl'   => '5.10.0',
+  warnings => '1.06'
+ },
+ meta_merge =>
+ {
+    resources =>
+    {
+            repository => 'https://github.com/kimryan/Lingua-EN-Sentence'
+    }
+  }
+) -> create_build_script();
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/Changes 
new/Lingua-EN-Sentence-0.33/Changes
--- old/Lingua-EN-Sentence-0.31/Changes 2018-08-19 10:06:56.000000000 +0200
+++ new/Lingua-EN-Sentence-0.33/Changes 2022-07-05 02:58:57.000000000 +0200
@@ -104,4 +104,13 @@
 0.31 Aug 19 2018
     Declared min version of Perl. Fix for RT bug #124686 
        
+0.32 July 2022
+       fixed bug causing abbreviation followed by '(' to break sentnece, 
reported in github
+       dot following an abbreviation now explicitly marked up
+       added more acronyms
+       improved documentation
+       improved tests
+       added verbose moe for debugging
        
+0.33 July 05 2022
+       fixed version numbersin Build.PL and Makefile.PL
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/MANIFEST 
new/Lingua-EN-Sentence-0.33/MANIFEST
--- old/Lingua-EN-Sentence-0.31/MANIFEST        2016-08-08 08:43:14.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.33/MANIFEST        2022-07-04 08:13:30.000000000 
+0200
@@ -3,6 +3,7 @@
 MANIFEST
 LICENCE
 Makefile.PL
+Build.PL
 lib/Lingua/EN/Sentence.pm
 t/main.t
 examples/demo.pl
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/META.json 
new/Lingua-EN-Sentence-0.33/META.json
--- old/Lingua-EN-Sentence-0.31/META.json       2018-08-19 10:26:07.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.33/META.json       2022-07-05 03:41:47.000000000 
+0200
@@ -4,13 +4,13 @@
       "Shlomo Yona, Kim Ryan <kimryan at cpan org>"
    ],
    "dynamic_config" : 1,
-   "generated_by" : "Module::Build version 0.4208",
+   "generated_by" : "Module::Build version 0.4229",
    "license" : [
       "perl_5"
    ],
    "meta-spec" : {
       "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
-      "version" : "2"
+      "version" : 2
    },
    "name" : "Lingua-EN-Sentence",
    "prereqs" : {
@@ -26,15 +26,15 @@
       },
       "runtime" : {
          "requires" : {
-            "perl" : "5.1",
-            "warnings" : "1.12"
+            "perl" : "v5.10.0",
+            "warnings" : "1.06"
          }
       }
    },
    "provides" : {
       "Lingua::EN::Sentence" : {
          "file" : "lib/Lingua/EN/Sentence.pm",
-         "version" : "0.31"
+         "version" : "0.33"
       }
    },
    "release_status" : "stable",
@@ -46,6 +46,6 @@
          "url" : "https://github.com/kimryan/Lingua-EN-Sentence"
       }
    },
-   "version" : "0.31",
-   "x_serialization_backend" : "JSON::PP version 2.27300"
+   "version" : "0.33",
+   "x_serialization_backend" : "JSON::PP version 4.04"
 }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/META.yml 
new/Lingua-EN-Sentence-0.33/META.yml
--- old/Lingua-EN-Sentence-0.31/META.yml        2018-08-19 10:26:07.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.33/META.yml        2022-07-05 03:41:47.000000000 
+0200
@@ -7,7 +7,7 @@
 configure_requires:
   Module::Build: '0.38'
 dynamic_config: 1
-generated_by: 'Module::Build version 0.4208, CPAN::Meta::Converter version 
2.150005'
+generated_by: 'Module::Build version 0.4229, CPAN::Meta::Converter version 
2.150010'
 license: perl
 meta-spec:
   url: http://module-build.sourceforge.net/META-spec-v1.4.html
@@ -16,12 +16,12 @@
 provides:
   Lingua::EN::Sentence:
     file: lib/Lingua/EN/Sentence.pm
-    version: '0.31'
+    version: '0.33'
 requires:
-  perl: '5.1'
-  warnings: '1.12'
+  perl: v5.10.0
+  warnings: '1.06'
 resources:
   license: http://dev.perl.org/licenses/
   repository: https://github.com/kimryan/Lingua-EN-Sentence
-version: '0.31'
-x_serialization_backend: 'CPAN::Meta::YAML version 0.016'
+version: '0.33'
+x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/Makefile.PL 
new/Lingua-EN-Sentence-0.33/Makefile.PL
--- old/Lingua-EN-Sentence-0.31/Makefile.PL     2016-08-08 09:09:39.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.33/Makefile.PL     2022-07-05 03:35:59.000000000 
+0200
@@ -7,8 +7,9 @@
     'VERSION_FROM'     => 'lib/Lingua/EN/Sentence.pm', # finds $VERSION
     'PREREQ_PM'    =>
     {
-        'warnings'  => 1.12
-      },    
+        'perl' => '5.10.0',
+        'warnings' => '1.06'
+     },    
     'AUTHOR'        => 'Shlomo Yona, Kim Ryan',
     'LICENSE'       => 'perl',
     META_MERGE => {
@@ -16,8 +17,8 @@
      resources => {
          repository => {
              type => 'git',
-             url  => 'https://github.com/dwimperl/Task-DWIM.git',
-             web  => 'https://github.com/dwimperl/Task-DWIM',
+             url  => 'https://github.com/kimryan/Lingua-EN-Sentence.git',
+             web  => 'https://github.com/kimryan/Lingua-EN-Sentence'
          },
      },
     }        
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/examples/demo.pl 
new/Lingua-EN-Sentence-0.33/examples/demo.pl
--- old/Lingua-EN-Sentence-0.31/examples/demo.pl        2016-08-08 
08:30:56.000000000 +0200
+++ new/Lingua-EN-Sentence-0.33/examples/demo.pl        2022-07-04 
08:07:00.000000000 +0200
@@ -3,14 +3,36 @@
 Demo program of CPAN module Lingua::EN::Sentence, sentence splitter
 
 =cut
-
 use strict;
 use warnings;
+use Lingua::EN::Sentence qw( get_sentences add_acronyms  get_EOS set_EOS 
set_locale);
 
+print("Started\n");
+my $text = q{
+A sentence usually ends with a dot, exclamation or question mark optionally 
followed by a space!
+A string followed by 2 carriage returns denotes a sentence, even though it 
doesn't end in a dot
+
+Dots after single letters such as U.S.A. or in numbers like -12.34 will not 
cause a split
+as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. 
Calif. Esq.
+and (some text) ellipsis such as ... or . . are ignored.
+Some valid cases canot be deteected, such as the answer is X. It cannot easily 
be
+differentiated from the single letter-dot sequence to abbreviate a person's 
given name.
+Numbered points within a sentence will not cause a split 1. Like this one.
+See the code for all the rules that apply.
+This string has 7 sentences.
+};
 
-use Lingua::EN::Sentence qw( get_sentences add_acronyms  get_EOS set_EOS 
set_locale);
+my $sentences=get_sentences($text);     ## Get the sentences.
+my $num_sentences = (@$sentences);
+my $i;
+print("There are: $num_sentences sentences\n" );
+foreach my $sent (@$sentences)
+{
+    $i++;
+    print("SENTENCE $i:$sent\n");
+}
 
-my $text = q{First sentence.
+ $text = q{First sentence.
 12. point 12
 Some numbers 12.46, -.123,3:.
 Some ???utf quotes wrap this??? ???And more???};
@@ -23,22 +45,21 @@
 $text =~ s/???/"/g;
 
 # Change lines starting with numbered points from x. to x) to avoid confusion 
with dots
-$text =~ s/\n(\d{1,})./\n$1\)/g;
-
-
-print("Started\n");
-my $sentences=get_sentences($text);     ## Get the sentences.
+ $text =~ s/\n(\d{1,})./\n$1\)/g;
+ 
+ $sentences=get_sentences($text);     ## Get the sentences.
 my $num_sentences = (@$sentences);
-
-print("There are: $num_sentences sentences\n" );
-my $i;
 foreach my $sent (@$sentences)
 {
-    $i++;
-    print("SENTENCE $i >>>$sent<<<\n");
+    print("$sent\n");
 }
 
 
 
+
+
+
+
+
 
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/lib/Lingua/EN/Sentence.pm 
new/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm
--- old/Lingua-EN-Sentence-0.31/lib/Lingua/EN/Sentence.pm       2018-08-19 
10:19:59.000000000 +0200
+++ new/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm       2022-07-05 
03:02:14.000000000 +0200
@@ -13,11 +13,27 @@
 =head1 SYNOPSIS
 
        use Lingua::EN::Sentence qw( get_sentences add_acronyms );
-
+       
        add_acronyms('lt','gen');               ## adding support for 'Lt. Gen.'
-       my $sentences=get_sentences($text);     ## Get the sentences.
-       foreach my $sentence (@$sentences) {
-               ## do something with $sentence
+       my $text = q{
+       A sentence usually ends with a dot, exclamation or question mark 
optionally followed by a space!
+       A string followed by 2 carriage returns denotes a sentence, even though 
it doesn't end in a dot
+       
+       Dots after single letters such as U.S.A. or in numbers like -12.34 will 
not cause a split
+       as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, 
Apr. Calif. Esq.
+       and (some text) ellipsis such as ... or . . are ignored.
+       Some valid cases canot be deteected, such as the answer is X. It cannot 
easily be
+       differentiated from the single letter-dot sequence to abbreviate a 
person's given name.
+       Numbered points within a sentence will not cause a split 1. Like this 
one.
+       See the code for all the rules that apply.
+       This string has 7 sentences.
+       };
+       
+       my $sentences=get_sentences($text);     # Get the sentences.
+       foreach my $sent (@$sentences)
+       {
+               $i++;
+               print("SENTENCE $i:$sent\n");
        }
 
 
@@ -31,18 +47,32 @@
 segmentations. But some of them are already integrated into this code and are
 being taken care of. Still, if you see that there are words causing the
 get_sentences function to fail, you can add those to the module, so it notices 
them.
+Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not 
'mrs.'
 
 =head1 ALGORITHM
 
+The first step is to mark  the dot ending an abbreviation by changing it to a 
special
+character. Now it won't cause a sentence split. The original dot is restored 
after
+the sentences are split
+
 Basically, I use a 'brute' regular expression to split the text into sentences.
 (Well, nothing is yet split - I just mark the end-of-sentence). Then I look 
into
 a set of rules which decide when an end-of-sentence is justified and when it's 
a
-mistake. In case of a mistake, the end-of-sentence mark is removed.
+mistake. In case of a mistake, the end-of-sentence mark is removed. What are
+such mistakes?
 
-What are such mistakes? Cases of abbreviations, for example. I have a list of
-such abbreviations (Please see public globals belwo for a list), and more
-general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be
-in the list as a special rule takes care of all single letter abbreviations).
+Letter-dot sequences:  U.S.A. ,  i.e. , e.g.
+Dot sequences: '..' or '...'  or 'text . . more text'
+Two carriage returns denote the end of a sentence even if it doesn't end with 
a dot
+
+=head1 LIMITATIONS
+
+1) John F. Kennedy was a former president
+2) The answer is F. That ends the quiz
+
+In the first sentence, F. is detected as a persons initial and not the end of 
a sentence.
+But this means we cannot detect the true end of sentence 2, which is after the 
'F'. This
+case is not common though.
 
 =head1 FUNCTIONS
 
@@ -133,6 +163,8 @@
 =head1 SEE ALSO
 
        Text::Sentence
+       Lingua::Sentence
+       Raku port of Lingua::EN::Sentence
        
 =head1 REPOSITORY
 
@@ -148,7 +180,7 @@
 =head1 COPYRIGHT AND LICENSE
 
 Copyright (c) 2001-2016 Shlomo Yona. All rights reserved.
-Copyright (c) 2018 Kim Ryan. All rights reserved.
+Copyright (c) 2022 Kim Ryan. All rights reserved.
 
 This library is free software; you can redistribute it and/or modify
 it under the same terms as Perl itself.
@@ -187,7 +219,7 @@
 use Carp qw/cluck/;
 use English;
 
-our $VERSION = '0.31';
+our $VERSION = '0.33';
 
 our $LOC;
 if ($OSNAME ne 'android') {
@@ -201,34 +233,36 @@
 @ISA = qw( Exporter );
 @EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS 
set_EOS set_locale);             
 
-our $EOS="\001";
+our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging
+our $EOS = "\001"; #"__EOS__";
+our $EOA = '__EOA__';
 
 our $P = q/[\.!?]/;                        # PUNCTUATION
 
 $AP =  q/(?:'|"|\?|\)|\]|\})?/;        # AFTER PUNCTUATION
 our $PAP = $P.$AP;
 
-my @PEOPLE = qw( mr mrs ms dr prof mme ms?gr sens? reps? gov attys? supt insp 
const det revd? ald rt hon);
-my @TITLE_SUFFIXES = qw(PhD jn?r sn?r esq md llb);
-my @MILITARY = qw( col gen lt cdr cmdr adm capt sgt cpl maj pte);
-my @INSTITUTES = qw( dept univ assn bros);
-my @COMPANIES = qw( inc ltd co corp);
+# ACRONYMS AND ABBREVIATIONS
+my @PEOPLE = qw( Mr Mrs Ms Dr Prof Mme Ms?gr Sens? Reps? Gov Attys? Supt Insp 
Const Det Revd? Ald Rt Hon);
+my @TITLE_SUFFIXES = qw(PhD Jn?r Sn?r Esq MD LLB);
+my @MILITARY = qw( Col Gen Lt Cm?dr Adm Capt Sgt Cpl Maj Pte);
+my @INSTITUTES = qw( Dept Univ Assn Bros);
+my @COMPANIES = qw( Inc Pty Ltd Co Corp);
 my @PLACES =
 qw(
-       arc al ave blv?d cl ct cres dr expy? fw?y hwa?y la pde? pl plz rd st 
tce 
+       Arc Al Ave Blv?d Cl Ct Cres Dr Expy? Fw?y Hwa?y La Pde? Pl Plz Rd St 
Tce 
        dist mt km in ft        
        Ala  Ariz Ark Cal Calif Col Colo Conn Del Fed  Fla Ga Ida Id Ill Ind Ia 
Kan Kans Ken Ky
-       La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr  Nev Mex Okla Ok Ore 
Penna Penn Pa  Dak 
-       Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta  Man Ont Qu? Sask Yuk
+       La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr Nev Mex Okla Ok Ore 
Penna Penn Pa Dak 
+       Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta Man Ont Qu? Sask Yuk
        Aust Vic Qld Tas
 );
-my @MONTHS = qw(jan feb mar apr may jun jul aug sep sept oct nov dec);
+my @MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sept? Oct Nov Dec);
 my @MISC = qw(no esp est);  # Established
 my @LATIN = qw(vs etc al ibid sic);
+my @MATH = qw(fig eq sec cf Thm Def Conj resp);
 
-our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, 
@COMPANIES, @PLACES, @MONTHS, @MISC, @LATIN );
-my $abbreviation_regex;
-_set_abbreviations_regex();
+our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, 
@COMPANIES, @PLACES, @MONTHS, @MISC,@LATIN, @MATH);
 
 
 #==============================================================================
@@ -245,13 +279,31 @@
 # places which are not indeed end-of-sentence.
 #------------------------------------------------------------------------------
 sub get_sentences {
-       my ($text)=@_;
+       my ($text) = @_;
        return [] unless defined $text;
-       my $marked_text = first_sentence_breaking($text);
-       my $fixed_marked_text = remove_false_end_of_sentence($marked_text);
-       $fixed_marked_text = split_unsplit_stuff($fixed_marked_text);
-       my @sentences = split(/$EOS/,$fixed_marked_text);
+       $VERBOSE and print("ORIGINAL\n$text\n");
+       
+       $text = mark_up_abbreviations($text);
+       $VERBOSE and print("mark_up_abbreviations\n$text\n");
+       
+       $text = first_sentence_breaking($text);
+       $VERBOSE and print("first_sentence_breaking\n$text\n");
+       
+       $text = remove_false_end_of_sentence($text);
+       $VERBOSE and print("remove_false_end_of_sentence\n$text\n");
+       
+       $text = split_unsplit_stuff($text);
+       $VERBOSE and print("split_unsplit_stuff\n$text\n");
+       
+       my @sentences = split(/$EOS/,$text);
        my $cleaned_sentences = clean_sentences(\@sentences);
+       if ($VERBOSE) {
+               my $i;
+               foreach my $sent (@$cleaned_sentences) {
+                       $i++;
+                       print("SENTENCE $i >>>$sent<<<\n");
+               }
+       }
        return $cleaned_sentences;
 }
 
@@ -260,7 +312,6 @@
 #------------------------------------------------------------------------------
 sub add_acronyms {
        push @ABBREVIATIONS, @_;
-       _set_abbreviations_regex();
 }
 
 #------------------------------------------------------------------------------
@@ -275,7 +326,6 @@
 #------------------------------------------------------------------------------
 sub set_acronyms {
        @ABBREVIATIONS=@_;
-       _set_abbreviations_regex();
 }
 
 #------------------------------------------------------------------------------
@@ -295,7 +345,6 @@
                return $EOS;
        }
     $EOS = $new_EOS;
-    _set_abbreviations_regex();
     return $EOS;       
 }
 
@@ -341,24 +390,19 @@
        }       
 }
 
-
 #==============================================================================
 #
 # Private methods
 #
 #==============================================================================
 
-# save some time by pre-compiling a regex used for working with abbreviations
-sub _set_abbreviations_regex {
-    my $abbreviations = join '|', @ABBREVIATIONS;
-    $abbreviation_regex = qr[(\b(?:$abbreviations)$PAP\s)$EOS]is;
-    return;
-}
 
 ## Please email me any suggestions for optimizing these RegExps.
 sub remove_false_end_of_sentence {
        my ($marked_segment) = @_;
-##     ## don't do u.s.a.
+       
+       
+##     ## don't do U.S.A., U.K.
 ##     $marked_segment=~s/(\.\w$PAP)$EOS/$1/sg; 
        $marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg;
        $marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg;         
@@ -368,13 +412,12 @@
        # Note: will fail for 12. Point 12
        $marked_segment=~s/(\s[\w\d]\.\s+)$EOS/$1/sg; 
 
-       # fix: bla bla... yada yada
-       $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg; 
-       # fix "." "?" "!"
-       $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
-       ## fix where abbreviations exist
-       $marked_segment=~s/$abbreviation_regex/$1/g;
+       # fix ellipsis: bla bla... yada yada
+       $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg;
        
+       # fix quoted EOS such as "." "?" "!"
+       $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
+               
        # don't break after quote unless its a capital letter.
        $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
 
@@ -382,34 +425,42 @@
        $marked_segment=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
     $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
 
-
        $marked_segment=~s/(\s$PAP\s)$EOS/$1/sg;
+       
        return $marked_segment;
 }
 
 sub split_unsplit_stuff {
        my ($text) = @_;
+       
 
        # $text=~s/(\D\d+)($P)(\s+)/$1$2$EOS$3/sg; # breaks numbered points, 
such as {EOL}1. point one
 
-       $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg; 
-       $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs;
+       $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg;
+       
+       # eg 'end. (' -> 'end. $EOS ('
+       $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket
+       
        $text=~s/('\w$P)(\s)/$1$EOS$2/gs;
 
 
        $text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis;
 
-##     # split where single capital letter followed by dot makes sense to 
break.
-##     # notice these are exceptions to the general rule NOT to split on single
-##     # letter.
-##     # notice also that sibgle letter M is missing here, due to French 
'mister'
-##     # which is represented as M.
-##     #
-##     # the rule will not split on names begining or containing 
-##     # single capital letter dot in the first or second name
-##     # assuming 2 or three word name.
-##     
$text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg;
-
+       # split where single capital letter followed by dot makes sense to 
break.
+       # notice these are exceptions to the general rule NOT to split on single
+       # letter.
+       # notice also that single letter M is missing here, due to French 
'mister'
+       # which is represented as M.
+       #
+       # the rule will not split on names beginning or containing 
+       # single capital letter dot in the first or second name
+       # assuming 2 or three word name.
+       
+       # NOT WORKING , it breaks up U.S.A. after U.
+       # Valid cases if single letter thrn dot are rare, such as 'The answer 
is F'.
+       # Can't decipher meaning of this regex
+       # 
$text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg;
+       
 
        # add EOS when you see "a.m." or "p.m." followed by a capital letter.
        $text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs;
@@ -426,16 +477,39 @@
                        $s=~s/^\s*//;
                        $s=~s/\s*$//;
 ##                     $s=~s/\s+/ /g;
+                       # replace end of abbrev marker with a dot
+                       $s=~s/$EOA/\./g;
                        push @$cleaned_sentences,$s;
                }
        return $cleaned_sentences;
 }
 
+# Replace seuence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc
+# This simplifies the code that detects end of sentnees. The marker is
+# replaced with the original dot adter sentence slitting
+
+sub mark_up_abbreviations {
+       my ($text) = @_;
+       
+       my %found_abbrevs;      
+       foreach my $abbrev (@ABBREVIATIONS) {
+               if ($text=~/\b($abbrev)\./i) {
+                       $found_abbrevs{$abbrev} = 1;
+         }             
+       }
+
+       foreach my $abbrev (keys %found_abbrevs) {
+               $text=~s/($abbrev)\./$1$EOA/gs;
+       }
+       
+       return $text;
+}
+
 sub first_sentence_breaking {
        my ($text) = @_;
        $text=~s/\n\s*\n/$EOS/gs;       ## double new-line means a different 
sentence.
        $text=~s/($PAP\s)/$1$EOS/gs;
-       $text=~s/(\s\w$P)/$1$EOS/gs; # breake also when single letter comes 
before punc.
+       $text=~s/(\s\w$P)/$1$EOS/gs; # break also when single letter comes 
before punc.
        return $text;
 }
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/t/main.t 
new/Lingua-EN-Sentence-0.33/t/main.t
--- old/Lingua-EN-Sentence-0.31/t/main.t        2016-08-08 08:40:41.000000000 
+0200
+++ new/Lingua-EN-Sentence-0.33/t/main.t        2022-07-04 08:07:56.000000000 
+0200
@@ -17,20 +17,25 @@
 use Lingua::EN::Sentence qw( get_sentences add_acronyms get_acronyms);
 
 my $par = q{
-Returns the number of sentences in string.
-A sentence ends with a dot, exclamation or question mark followed by a space! 
-Dots after single letters such as U.S.A or e.g. are ignored,
-  as well as common abbreviations such as Dr. Ms. esp. Apr. Calif. and Ave.,
-  initials such as 'Mr. A. Smith'.
-This string has 4 sentences.
+A sentence usually ends with a dot, exclamation or question mark optionally 
followed by a space!
+A string followed by 2 carriage returns denotes a sentence, even though it 
doesn't end in a dot
+
+Dots after single letters such as U.S.A. or in numbers like -12.34 will not 
cause a split
+as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. 
Calif. Esq.
+and (some text) ellipsis such as ... or . . are ignored.
+Some valid cases canot be deteected, such as the answer is X. It cannot easily 
be
+differentiated from the single letter-dot sequence to abbreviate a person's 
given name.
+Numbered points within a sentence will not cause a split 1. Like this one.
+See the code for all the rules that apply.
+This string has 7 sentences.
 };
 
 my $sentences=get_sentences($par);     
-is( @$sentences, 4,'sub sentence_count');
+is( @$sentences, 7,'sub sentence_count');
 
-$par .= 'Now add an acronym, such as ret. for retired.';
+$par .= 'Now add an acronym, such as Ret. for retired.';
 add_acronyms('Ret');
 $sentences=get_sentences($par);
-is( @$sentences, 5,'sub add_acronyms');
+is( @$sentences, 8,'sub add_acronyms');

commit perl-Lingua-EN-Sentence for openSUSE:Factory

Reply via email to