Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package perl-Lingua-EN-Sentence for openSUSE:Factory checked in at 2023-07-12 17:26:52 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence (Old) and /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.8922 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "perl-Lingua-EN-Sentence" Wed Jul 12 17:26:52 2023 rev:17 rq:1098155 version:0.34 Changes: -------- --- /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence/perl-Lingua-EN-Sentence.changes 2022-07-13 15:35:05.333377829 +0200 +++ /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.8922/perl-Lingua-EN-Sentence.changes 2023-07-12 17:27:11.142436451 +0200 @@ -1,0 +2,6 @@ +Wed Jun 21 03:07:10 UTC 2023 - Tina Müller <timueller+p...@suse.de> + +- updated to 0.34 + see /usr/share/doc/packages/perl-Lingua-EN-Sentence/Changes + +------------------------------------------------------------------- Old: ---- Lingua-EN-Sentence-0.33.tar.gz New: ---- Lingua-EN-Sentence-0.34.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ perl-Lingua-EN-Sentence.spec ++++++ --- /var/tmp/diff_new_pack.3R7GNd/_old 2023-07-12 17:27:11.922440998 +0200 +++ /var/tmp/diff_new_pack.3R7GNd/_new 2023-07-12 17:27:11.926441022 +0200 @@ -1,7 +1,7 @@ # # spec file for package perl-Lingua-EN-Sentence # -# Copyright (c) 2022 SUSE LLC +# Copyright (c) 2023 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -18,7 +18,7 @@ %define cpan_name Lingua-EN-Sentence Name: perl-Lingua-EN-Sentence -Version: 0.33 +Version: 0.34 Release: 0 License: Artistic-1.0 OR GPL-1.0-or-later Summary: Split text into sentences @@ -50,14 +50,14 @@ %autosetup -n %{cpan_name}-%{version} %build -perl Build.PL installdirs=vendor -./Build build flags=%{?_smp_mflags} +perl Build.PL --installdirs=vendor +./Build build --flags=%{?_smp_mflags} %check ./Build test %install -./Build install destdir=%{buildroot} create_packlist=0 +./Build install --destdir=%{buildroot} --create_packlist=0 %perl_gen_filelist %files -f %{name}.files ++++++ Lingua-EN-Sentence-0.33.tar.gz -> Lingua-EN-Sentence-0.34.tar.gz ++++++ diff -urN '--exclude=CVS' 
'--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/Changes new/Lingua-EN-Sentence-0.34/Changes --- old/Lingua-EN-Sentence-0.33/Changes 2022-07-05 02:58:57.000000000 +0200 +++ new/Lingua-EN-Sentence-0.34/Changes 2023-06-20 02:47:11.000000000 +0200 @@ -105,7 +105,7 @@ Declared min version of Perl. Fix for RT bug #124686 0.32 July 2022 - fixed bug causing abbreviation followed by '(' to break sentnece, reported in github + fixed bug causing abbreviation followed by '(' to break sentence, reported in github dot following an abbreviation now explicitly marked up added more acronyms improved documentation @@ -113,4 +113,7 @@ added verbose moe for debugging 0.33 July 05 2022 - fixed version numbersin Build.PL and Makefile.PL + fixed version numbers in Build.PL and Makefile.PL + +0.34 June 20 2023 + fixed version numbers in Build.PL and Makefile.PL diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/META.json new/Lingua-EN-Sentence-0.34/META.json --- old/Lingua-EN-Sentence-0.33/META.json 2022-07-05 03:41:47.000000000 +0200 +++ new/Lingua-EN-Sentence-0.34/META.json 2023-06-20 03:56:25.000000000 +0200 @@ -34,7 +34,7 @@ "provides" : { "Lingua::EN::Sentence" : { "file" : "lib/Lingua/EN/Sentence.pm", - "version" : "0.33" + "version" : "0.34" } }, "release_status" : "stable", @@ -46,6 +46,6 @@ "url" : "https://github.com/kimryan/Lingua-EN-Sentence" } }, - "version" : "0.33", + "version" : "0.34", "x_serialization_backend" : "JSON::PP version 4.04" } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/META.yml new/Lingua-EN-Sentence-0.34/META.yml --- old/Lingua-EN-Sentence-0.33/META.yml 2022-07-05 03:41:47.000000000 +0200 +++ new/Lingua-EN-Sentence-0.34/META.yml 2023-06-20 03:56:25.000000000 +0200 @@ -16,12 +16,12 @@ provides: Lingua::EN::Sentence: file: lib/Lingua/EN/Sentence.pm - version: '0.33' + version: 
'0.34' requires: perl: v5.10.0 warnings: '1.06' resources: license: http://dev.perl.org/licenses/ repository: https://github.com/kimryan/Lingua-EN-Sentence -version: '0.33' +version: '0.34' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/README new/Lingua-EN-Sentence-0.34/README --- old/Lingua-EN-Sentence-0.33/README 2016-08-09 03:22:20.000000000 +0200 +++ new/Lingua-EN-Sentence-0.34/README 2022-07-05 08:39:19.000000000 +0200 @@ -6,16 +6,33 @@ SYNOPSIS use Lingua::EN::Sentence qw( get_sentences add_acronyms ); - + add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.' - my $sentences=get_sentences($text); ## Get the sentences. - foreach my $sentence (@$sentences) { - ## do something with $sentence + my $text = q{ + A sentence usually ends with a dot, exclamation or question mark optionally followed by a space! + A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot + + Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split + as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq. + and (some text) ellipsis such as ... or . . are ignored. + Some valid cases canot be deteected, such as the answer is X. It cannot easily be + differentiated from the single letter-dot sequence to abbreviate a person's given name. + Numbered points within a sentence will not cause a split 1. Like this one. + See the code for all the rules that apply. + This string has 7 sentences. + }; + + my $sentences=get_sentences($text); # Get the sentences. 
+ foreach my $sent (@$sentences) + { + $i++; + print("SENTENCE $i:$sent\n"); } - + + DESCRIPTION -The Lingua::EN::Sentence module contains the function get_sentences, which +The C<Lingua::EN::Sentence> module contains the function get_sentences, which splits text into its constituent sentences, based on a regular expression and a list of abbreviations (built in and given). @@ -23,6 +40,8 @@ segmentations. But some of them are already integrated into this code and are being taken care of. Still, if you see that there are words causing the get_sentences function to fail, you can add those to the module, so it notices them. +Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not 'mrs.' + INSTALLATION diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/examples/demo.pl new/Lingua-EN-Sentence-0.34/examples/demo.pl --- old/Lingua-EN-Sentence-0.33/examples/demo.pl 2022-07-04 08:07:00.000000000 +0200 +++ new/Lingua-EN-Sentence-0.34/examples/demo.pl 2023-06-20 03:54:05.000000000 +0200 @@ -22,17 +22,20 @@ This string has 7 sentences. }; -my $sentences=get_sentences($text); ## Get the sentences. -my $num_sentences = (@$sentences); -my $i; -print("There are: $num_sentences sentences\n" ); -foreach my $sent (@$sentences) +my $sentences=get_sentences($text); +if (defined($sentences)) { - $i++; - print("SENTENCE $i:$sent\n"); + my $num_sentences = (@$sentences); + my $i; + print("There are: $num_sentences sentences\n" ); + foreach my $sent (@$sentences) + { + $i++; + print("SENTENCE $i:$sent\n"); + } } - $text = q{First sentence. +$text = q{First sentence. 12. point 12 Some numbers 12.46, -.123,3:. 
Some “utf quotes wrap this” “And more”}; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm new/Lingua-EN-Sentence-0.34/lib/Lingua/EN/Sentence.pm --- old/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm 2022-07-05 03:02:14.000000000 +0200 +++ new/Lingua-EN-Sentence-0.34/lib/Lingua/EN/Sentence.pm 2023-06-20 02:47:49.000000000 +0200 @@ -1,10 +1,5 @@ package Lingua::EN::Sentence; -#============================================================================== -# -# Start of POD -# -#============================================================================== =head1 NAME @@ -12,30 +7,32 @@ =head1 SYNOPSIS - use Lingua::EN::Sentence qw( get_sentences add_acronyms ); - - add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.' - my $text = q{ - A sentence usually ends with a dot, exclamation or question mark optionally followed by a space! - A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot - - Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split - as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq. - and (some text) ellipsis such as ... or . . are ignored. - Some valid cases canot be deteected, such as the answer is X. It cannot easily be - differentiated from the single letter-dot sequence to abbreviate a person's given name. - Numbered points within a sentence will not cause a split 1. Like this one. - See the code for all the rules that apply. - This string has 7 sentences. - }; - - my $sentences=get_sentences($text); # Get the sentences. +use Lingua::EN::Sentence qw( get_sentences add_acronyms ); + +add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.' +my $text = q{ +A sentence usually ends with a dot, exclamation or question mark optionally followed by a space! 
+A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot + +Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split +as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq. +and (some text) ellipsis such as ... or . . are ignored. +Some valid cases canot be deteected, such as the answer is X. It cannot easily be +differentiated from the single letter-dot sequence to abbreviate a person's given name. +Numbered points within a sentence will not cause a split 1. Like this one. +See the code for all the rules that apply. +This string has 7 sentences. +}; + +if (defined($sentences)) +{ + my $sentences = get_sentences($text); foreach my $sent (@$sentences) { $i++; print("SENTENCE $i:$sent\n"); } - +} =head1 DESCRIPTION @@ -87,7 +84,8 @@ and returns a reference to an array of sentences that the text has been split into. Returned sentences will be trimmed (beginning and end of sentence) of white space. Strings with no alpha-numeric characters in them, won't be -returned as sentences. +returned as sentences. If no text is supplied, a reference to an empty array +is returned. 
=item add_acronyms( @acronyms ) @@ -189,13 +187,6 @@ #============================================================================== # -# End of POD -# -#============================================================================== - - -#============================================================================== -# # Pragmas # #============================================================================== @@ -219,7 +210,7 @@ use Carp qw/cluck/; use English; -our $VERSION = '0.33'; +our $VERSION = '0.34'; our $LOC; if ($OSNAME ne 'android') { @@ -234,7 +225,7 @@ @EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS set_EOS set_locale); our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging -our $EOS = "\001"; #"__EOS__"; +our $EOS = "\001"; our $EOA = '__EOA__'; our $P = q/[\.!?]/; # PUNCTUATION @@ -280,7 +271,12 @@ #------------------------------------------------------------------------------ sub get_sentences { my ($text) = @_; - return [] unless defined $text; + + unless (defined($text)) + { + return []; + } + $VERBOSE and print("ORIGINAL\n$text\n"); $text = mark_up_abbreviations($text); @@ -395,15 +391,11 @@ # Private methods # #============================================================================== - - -## Please email me any suggestions for optimizing these RegExps. sub remove_false_end_of_sentence { my ($marked_segment) = @_; -## ## don't do U.S.A., U.K. -## $marked_segment=~s/(\.\w$PAP)$EOS/$1/sg; + # don't split U.S.A., U.K. $marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg; $marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg; @@ -434,34 +426,15 @@ my ($text) = @_; - # $text=~s/(\D\d+)($P)(\s+)/$1$2$EOS$3/sg; # breaks numbered points, such as {EOL}1. point one - + # breaks numbered points, such as {EOL}1. point one $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg; # eg 'end. (' -> 'end. 
$EOS (' - $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket - + $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket $text=~s/('\w$P)(\s)/$1$EOS$2/gs; - $text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis; - # split where single capital letter followed by dot makes sense to break. - # notice these are exceptions to the general rule NOT to split on single - # letter. - # notice also that single letter M is missing here, due to French 'mister' - # which is represented as M. - # - # the rule will not split on names beginning or containing - # single capital letter dot in the first or second name - # assuming 2 or three word name. - - # NOT WORKING , it breaks up U.S.A. after U. - # Valid cases if single letter thrn dot are rare, such as 'The answer is F'. - # Can't decipher meaning of this regex - # $text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg; - - # add EOS when you see "a.m." or "p.m." followed by a capital letter. $text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs; @@ -484,7 +457,7 @@ return $cleaned_sentences; } -# Replace seuence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc +# Replace sequence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc # This simplifies the code that detects end of sentnees. The marker is # replaced with the original dot adter sentence slitting @@ -513,11 +486,4 @@ return $text; } - -#============================================================================== -# -# Return TRUE -# -#============================================================================== - 1;