Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package perl-Lingua-EN-Sentence for
openSUSE:Factory checked in at 2023-07-12 17:26:52
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence (Old)
and /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.8922 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "perl-Lingua-EN-Sentence"
Wed Jul 12 17:26:52 2023 rev:17 rq:1098155 version:0.34
Changes:
--------
---
/work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence/perl-Lingua-EN-Sentence.changes
2022-07-13 15:35:05.333377829 +0200
+++
/work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.8922/perl-Lingua-EN-Sentence.changes
2023-07-12 17:27:11.142436451 +0200
@@ -1,0 +2,6 @@
+Wed Jun 21 03:07:10 UTC 2023 - Tina Müller <[email protected]>
+
+- updated to 0.34
+ see /usr/share/doc/packages/perl-Lingua-EN-Sentence/Changes
+
+-------------------------------------------------------------------
Old:
----
Lingua-EN-Sentence-0.33.tar.gz
New:
----
Lingua-EN-Sentence-0.34.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ perl-Lingua-EN-Sentence.spec ++++++
--- /var/tmp/diff_new_pack.3R7GNd/_old 2023-07-12 17:27:11.922440998 +0200
+++ /var/tmp/diff_new_pack.3R7GNd/_new 2023-07-12 17:27:11.926441022 +0200
@@ -1,7 +1,7 @@
#
# spec file for package perl-Lingua-EN-Sentence
#
-# Copyright (c) 2022 SUSE LLC
+# Copyright (c) 2023 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -18,7 +18,7 @@
%define cpan_name Lingua-EN-Sentence
Name: perl-Lingua-EN-Sentence
-Version: 0.33
+Version: 0.34
Release: 0
License: Artistic-1.0 OR GPL-1.0-or-later
Summary: Split text into sentences
@@ -50,14 +50,14 @@
%autosetup -n %{cpan_name}-%{version}
%build
-perl Build.PL installdirs=vendor
-./Build build flags=%{?_smp_mflags}
+perl Build.PL --installdirs=vendor
+./Build build --flags=%{?_smp_mflags}
%check
./Build test
%install
-./Build install destdir=%{buildroot} create_packlist=0
+./Build install --destdir=%{buildroot} --create_packlist=0
%perl_gen_filelist
%files -f %{name}.files
++++++ Lingua-EN-Sentence-0.33.tar.gz -> Lingua-EN-Sentence-0.34.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/Changes
new/Lingua-EN-Sentence-0.34/Changes
--- old/Lingua-EN-Sentence-0.33/Changes 2022-07-05 02:58:57.000000000 +0200
+++ new/Lingua-EN-Sentence-0.34/Changes 2023-06-20 02:47:11.000000000 +0200
@@ -105,7 +105,7 @@
Declared min version of Perl. Fix for RT bug #124686
0.32 July 2022
- fixed bug causing abbreviation followed by '(' to break sentnece,
reported in github
+ fixed bug causing abbreviation followed by '(' to break sentence,
reported in github
dot following an abbreviation now explicitly marked up
added more acronyms
improved documentation
@@ -113,4 +113,7 @@
added verbose mode for debugging
0.33 July 05 2022
- fixed version numbersin Build.PL and Makefile.PL
+ fixed version numbers in Build.PL and Makefile.PL
+
+0.34 June 20 2023
+ fixed version numbers in Build.PL and Makefile.PL
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/META.json
new/Lingua-EN-Sentence-0.34/META.json
--- old/Lingua-EN-Sentence-0.33/META.json 2022-07-05 03:41:47.000000000
+0200
+++ new/Lingua-EN-Sentence-0.34/META.json 2023-06-20 03:56:25.000000000
+0200
@@ -34,7 +34,7 @@
"provides" : {
"Lingua::EN::Sentence" : {
"file" : "lib/Lingua/EN/Sentence.pm",
- "version" : "0.33"
+ "version" : "0.34"
}
},
"release_status" : "stable",
@@ -46,6 +46,6 @@
"url" : "https://github.com/kimryan/Lingua-EN-Sentence"
}
},
- "version" : "0.33",
+ "version" : "0.34",
"x_serialization_backend" : "JSON::PP version 4.04"
}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/META.yml
new/Lingua-EN-Sentence-0.34/META.yml
--- old/Lingua-EN-Sentence-0.33/META.yml 2022-07-05 03:41:47.000000000
+0200
+++ new/Lingua-EN-Sentence-0.34/META.yml 2023-06-20 03:56:25.000000000
+0200
@@ -16,12 +16,12 @@
provides:
Lingua::EN::Sentence:
file: lib/Lingua/EN/Sentence.pm
- version: '0.33'
+ version: '0.34'
requires:
perl: v5.10.0
warnings: '1.06'
resources:
license: http://dev.perl.org/licenses/
repository: https://github.com/kimryan/Lingua-EN-Sentence
-version: '0.33'
+version: '0.34'
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/README
new/Lingua-EN-Sentence-0.34/README
--- old/Lingua-EN-Sentence-0.33/README 2016-08-09 03:22:20.000000000 +0200
+++ new/Lingua-EN-Sentence-0.34/README 2022-07-05 08:39:19.000000000 +0200
@@ -6,16 +6,33 @@
SYNOPSIS
use Lingua::EN::Sentence qw( get_sentences add_acronyms );
-
+
add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.'
- my $sentences=get_sentences($text); ## Get the sentences.
- foreach my $sentence (@$sentences) {
- ## do something with $sentence
+ my $text = q{
+ A sentence usually ends with a dot, exclamation or question mark
optionally followed by a space!
+ A string followed by 2 carriage returns denotes a sentence, even though
it doesn't end in a dot
+
+ Dots after single letters such as U.S.A. or in numbers like -12.34 will
not cause a split
+ as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones,
Apr. Calif. Esq.
+ and (some text) ellipsis such as ... or . . are ignored.
+ Some valid cases canot be deteected, such as the answer is X. It cannot
easily be
+ differentiated from the single letter-dot sequence to abbreviate a
person's given name.
+ Numbered points within a sentence will not cause a split 1. Like this
one.
+ See the code for all the rules that apply.
+ This string has 7 sentences.
+ };
+
+ my $sentences=get_sentences($text); # Get the sentences.
+ foreach my $sent (@$sentences)
+ {
+ $i++;
+ print("SENTENCE $i:$sent\n");
}
-
+
+
DESCRIPTION
-The Lingua::EN::Sentence module contains the function get_sentences, which
+The C<Lingua::EN::Sentence> module contains the function get_sentences, which
splits text into its constituent sentences, based on a regular expression and a
list of abbreviations (built in and given).
@@ -23,6 +40,8 @@
segmentations. But some of them are already integrated into this code and are
being taken care of. Still, if you see that there are words causing the
get_sentences function to fail, you can add those to the module, so it notices
them.
+Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not
'mrs.'
+
INSTALLATION
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/examples/demo.pl
new/Lingua-EN-Sentence-0.34/examples/demo.pl
--- old/Lingua-EN-Sentence-0.33/examples/demo.pl 2022-07-04
08:07:00.000000000 +0200
+++ new/Lingua-EN-Sentence-0.34/examples/demo.pl 2023-06-20
03:54:05.000000000 +0200
@@ -22,17 +22,20 @@
This string has 7 sentences.
};
-my $sentences=get_sentences($text); ## Get the sentences.
-my $num_sentences = (@$sentences);
-my $i;
-print("There are: $num_sentences sentences\n" );
-foreach my $sent (@$sentences)
+my $sentences=get_sentences($text);
+if (defined($sentences))
{
- $i++;
- print("SENTENCE $i:$sent\n");
+ my $num_sentences = (@$sentences);
+ my $i;
+ print("There are: $num_sentences sentences\n" );
+ foreach my $sent (@$sentences)
+ {
+ $i++;
+ print("SENTENCE $i:$sent\n");
+ }
}
- $text = q{First sentence.
+$text = q{First sentence.
12. point 12
Some numbers 12.46, -.123,3:.
Some “utf quotes wrap this” “And more”};
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm
new/Lingua-EN-Sentence-0.34/lib/Lingua/EN/Sentence.pm
--- old/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm 2022-07-05
03:02:14.000000000 +0200
+++ new/Lingua-EN-Sentence-0.34/lib/Lingua/EN/Sentence.pm 2023-06-20
02:47:49.000000000 +0200
@@ -1,10 +1,5 @@
package Lingua::EN::Sentence;
-#==============================================================================
-#
-# Start of POD
-#
-#==============================================================================
=head1 NAME
@@ -12,30 +7,32 @@
=head1 SYNOPSIS
- use Lingua::EN::Sentence qw( get_sentences add_acronyms );
-
- add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.'
- my $text = q{
- A sentence usually ends with a dot, exclamation or question mark
optionally followed by a space!
- A string followed by 2 carriage returns denotes a sentence, even though
it doesn't end in a dot
-
- Dots after single letters such as U.S.A. or in numbers like -12.34 will
not cause a split
- as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones,
Apr. Calif. Esq.
- and (some text) ellipsis such as ... or . . are ignored.
- Some valid cases canot be deteected, such as the answer is X. It cannot
easily be
- differentiated from the single letter-dot sequence to abbreviate a
person's given name.
- Numbered points within a sentence will not cause a split 1. Like this
one.
- See the code for all the rules that apply.
- This string has 7 sentences.
- };
-
- my $sentences=get_sentences($text); # Get the sentences.
+use Lingua::EN::Sentence qw( get_sentences add_acronyms );
+
+add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.'
+my $text = q{
+A sentence usually ends with a dot, exclamation or question mark optionally
followed by a space!
+A string followed by 2 carriage returns denotes a sentence, even though it
doesn't end in a dot
+
+Dots after single letters such as U.S.A. or in numbers like -12.34 will not
cause a split
+as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr.
Calif. Esq.
+and (some text) ellipsis such as ... or . . are ignored.
+Some valid cases canot be deteected, such as the answer is X. It cannot easily
be
+differentiated from the single letter-dot sequence to abbreviate a person's
given name.
+Numbered points within a sentence will not cause a split 1. Like this one.
+See the code for all the rules that apply.
+This string has 7 sentences.
+};
+
+if (defined($sentences))
+{
+ my $sentences = get_sentences($text);
foreach my $sent (@$sentences)
{
$i++;
print("SENTENCE $i:$sent\n");
}
-
+}
=head1 DESCRIPTION
@@ -87,7 +84,8 @@
and returns a reference to an array of sentences that the text has been split
into. Returned sentences will be trimmed (beginning and end of sentence) of
white space. Strings with no alpha-numeric characters in them, won't be
-returned as sentences.
+returned as sentences. If no text is supplied, a reference to an empty array
+is returned.
=item add_acronyms( @acronyms )
@@ -189,13 +187,6 @@
#==============================================================================
#
-# End of POD
-#
-#==============================================================================
-
-
-#==============================================================================
-#
# Pragmas
#
#==============================================================================
@@ -219,7 +210,7 @@
use Carp qw/cluck/;
use English;
-our $VERSION = '0.33';
+our $VERSION = '0.34';
our $LOC;
if ($OSNAME ne 'android') {
@@ -234,7 +225,7 @@
@EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS
set_EOS set_locale);
our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging
-our $EOS = "\001"; #"__EOS__";
+our $EOS = "\001";
our $EOA = '__EOA__';
our $P = q/[\.!?]/; # PUNCTUATION
@@ -280,7 +271,12 @@
#------------------------------------------------------------------------------
sub get_sentences {
my ($text) = @_;
- return [] unless defined $text;
+
+ unless (defined($text))
+ {
+ return [];
+ }
+
$VERBOSE and print("ORIGINAL\n$text\n");
$text = mark_up_abbreviations($text);
@@ -395,15 +391,11 @@
# Private methods
#
#==============================================================================
-
-
-## Please email me any suggestions for optimizing these RegExps.
sub remove_false_end_of_sentence {
my ($marked_segment) = @_;
-## ## don't do U.S.A., U.K.
-## $marked_segment=~s/(\.\w$PAP)$EOS/$1/sg;
+ # don't split U.S.A., U.K.
$marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg;
$marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg;
@@ -434,34 +426,15 @@
my ($text) = @_;
- # $text=~s/(\D\d+)($P)(\s+)/$1$2$EOS$3/sg; # breaks numbered points,
such as {EOL}1. point one
-
+ # breaks numbered points, such as {EOL}1. point one
$text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg;
# eg 'end. (' -> 'end. $EOS ('
- $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket
-
+ $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket
$text=~s/('\w$P)(\s)/$1$EOS$2/gs;
-
$text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis;
- # split where single capital letter followed by dot makes sense to
break.
- # notice these are exceptions to the general rule NOT to split on single
- # letter.
- # notice also that single letter M is missing here, due to French
'mister'
- # which is represented as M.
- #
- # the rule will not split on names beginning or containing
- # single capital letter dot in the first or second name
- # assuming 2 or three word name.
-
- # NOT WORKING , it breaks up U.S.A. after U.
- # Valid cases if single letter thrn dot are rare, such as 'The answer
is F'.
- # Can't decipher meaning of this regex
- #
$text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg;
-
-
# add EOS when you see "a.m." or "p.m." followed by a capital letter.
$text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs;
@@ -484,7 +457,7 @@
return $cleaned_sentences;
}
-# Replace seuence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc
+# Replace sequence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc
# This simplifies the code that detects end of sentences. The marker is
# replaced with the original dot after sentence splitting
@@ -513,11 +486,4 @@
return $text;
}
-
-#==============================================================================
-#
-# Return TRUE
-#
-#==============================================================================
-
1;