Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package perl-Lingua-EN-Sentence for openSUSE:Factory checked in at 2022-07-13 15:35:04 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence (Old) and /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.1523 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "perl-Lingua-EN-Sentence" Wed Jul 13 15:35:04 2022 rev:16 rq:988972 version:0.33 Changes: -------- --- /work/SRC/openSUSE:Factory/perl-Lingua-EN-Sentence/perl-Lingua-EN-Sentence.changes 2018-08-31 10:43:51.167160167 +0200 +++ /work/SRC/openSUSE:Factory/.perl-Lingua-EN-Sentence.new.1523/perl-Lingua-EN-Sentence.changes 2022-07-13 15:35:05.333377829 +0200 @@ -1,0 +2,14 @@ +Tue Jul 5 03:06:22 UTC 2022 - Tina Müller <timueller+p...@suse.de> + +- updated to 0.33 + see /usr/share/doc/packages/perl-Lingua-EN-Sentence/Changes + + 0.32 July 2022 + fixed bug causing abbreviation followed by '(' to break sentnece, reported in github + dot following an abbreviation now explicitly marked up + added more acronyms + improved documentation + improved tests + added verbose moe for debugging + +------------------------------------------------------------------- Old: ---- Lingua-EN-Sentence-0.31.tar.gz New: ---- Lingua-EN-Sentence-0.33.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ perl-Lingua-EN-Sentence.spec ++++++ --- /var/tmp/diff_new_pack.RWUygO/_old 2022-07-13 15:35:05.885378679 +0200 +++ /var/tmp/diff_new_pack.RWUygO/_new 2022-07-13 15:35:05.889378686 +0200 @@ -1,7 +1,7 @@ # # spec file for package perl-Lingua-EN-Sentence # -# Copyright (c) 2018 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2022 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -12,28 +12,26 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. 
-# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via https://bugs.opensuse.org/ # +%define cpan_name Lingua-EN-Sentence Name: perl-Lingua-EN-Sentence -Version: 0.31 +Version: 0.33 Release: 0 -%define cpan_name Lingua-EN-Sentence -Summary: Split Text Into Sentences License: Artistic-1.0 OR GPL-1.0-or-later -Group: Development/Libraries/Perl -Url: http://search.cpan.org/dist/Lingua-EN-Sentence/ +Summary: Split text into sentences +URL: https://metacpan.org/release/%{cpan_name} Source0: https://cpan.metacpan.org/authors/id/K/KI/KIMRYAN/%{cpan_name}-%{version}.tar.gz Source1: cpanspec.yml BuildArch: noarch -BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: perl BuildRequires: perl-macros BuildRequires: perl(Module::Build) >= 0.380000 BuildRequires: perl(Test::More) >= 0.94 -BuildRequires: perl(warnings) >= 1.12 -Requires: perl(warnings) >= 1.12 +BuildRequires: perl(warnings) >= 1.06 +Requires: perl(warnings) >= 1.06 %{perl_requires} %description @@ -45,25 +43,25 @@ segmentations. But some of them are already integrated into this code and are being taken care of. Still, if you see that there are words causing the get_sentences function to fail, you can add those to the module, so it -notices them. +notices them. Note that abbreviations are case sensitive, so 'Mrs.' is +recognised but not 'mrs.' 
%prep -%setup -q -n %{cpan_name}-%{version} +%autosetup -n %{cpan_name}-%{version} %build -%{__perl} Makefile.PL INSTALLDIRS=vendor -%{__make} %{?_smp_mflags} +perl Build.PL installdirs=vendor +./Build build flags=%{?_smp_mflags} %check -%{__make} test +./Build test %install -%perl_make_install -%perl_process_packlist +./Build install destdir=%{buildroot} create_packlist=0 %perl_gen_filelist %files -f %{name}.files -%defattr(-,root,root,755) -%doc Changes examples LICENCE README +%doc Changes examples README +%license LICENCE %changelog ++++++ Lingua-EN-Sentence-0.31.tar.gz -> Lingua-EN-Sentence-0.33.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/Build.PL new/Lingua-EN-Sentence-0.33/Build.PL --- old/Lingua-EN-Sentence-0.31/Build.PL 1970-01-01 01:00:00.000000000 +0100 +++ new/Lingua-EN-Sentence-0.33/Build.PL 2022-07-05 03:36:50.000000000 +0200 @@ -0,0 +1,29 @@ +use Module::Build; + +Module::Build -> new +( + module_name => 'Lingua::EN::Sentence', + license => 'perl', + dist_abstract => "Split text into sentences", + dist_author => 'Shlomo Yona, Kim Ryan <kimryan at cpan org>', + build_requires => + { + Test::More => 0.94, + }, + configure_requires => + { + Module::Build => 0.3800, + }, + requires => + { + 'perl' => '5.10.0', + warnings => '1.06' + }, + meta_merge => + { + resources => + { + repository => 'https://github.com/kimryan/Lingua-EN-Sentence' + } + } +) -> create_build_script(); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/Changes new/Lingua-EN-Sentence-0.33/Changes --- old/Lingua-EN-Sentence-0.31/Changes 2018-08-19 10:06:56.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/Changes 2022-07-05 02:58:57.000000000 +0200 @@ -104,4 +104,13 @@ 0.31 Aug 19 2018 Declared min version of Perl. 
Fix for RT bug #124686 +0.32 July 2022 + fixed bug causing abbreviation followed by '(' to break sentnece, reported in github + dot following an abbreviation now explicitly marked up + added more acronyms + improved documentation + improved tests + added verbose moe for debugging +0.33 July 05 2022 + fixed version numbersin Build.PL and Makefile.PL diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/MANIFEST new/Lingua-EN-Sentence-0.33/MANIFEST --- old/Lingua-EN-Sentence-0.31/MANIFEST 2016-08-08 08:43:14.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/MANIFEST 2022-07-04 08:13:30.000000000 +0200 @@ -3,6 +3,7 @@ MANIFEST LICENCE Makefile.PL +Build.PL lib/Lingua/EN/Sentence.pm t/main.t examples/demo.pl diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/META.json new/Lingua-EN-Sentence-0.33/META.json --- old/Lingua-EN-Sentence-0.31/META.json 2018-08-19 10:26:07.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/META.json 2022-07-05 03:41:47.000000000 +0200 @@ -4,13 +4,13 @@ "Shlomo Yona, Kim Ryan <kimryan at cpan org>" ], "dynamic_config" : 1, - "generated_by" : "Module::Build version 0.4208", + "generated_by" : "Module::Build version 0.4229", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" + "version" : 2 }, "name" : "Lingua-EN-Sentence", "prereqs" : { @@ -26,15 +26,15 @@ }, "runtime" : { "requires" : { - "perl" : "5.1", - "warnings" : "1.12" + "perl" : "v5.10.0", + "warnings" : "1.06" } } }, "provides" : { "Lingua::EN::Sentence" : { "file" : "lib/Lingua/EN/Sentence.pm", - "version" : "0.31" + "version" : "0.33" } }, "release_status" : "stable", @@ -46,6 +46,6 @@ "url" : "https://github.com/kimryan/Lingua-EN-Sentence" } }, - "version" : "0.31", - "x_serialization_backend" : "JSON::PP version 2.27300" + "version" : "0.33", + "x_serialization_backend" : "JSON::PP version 
4.04" } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/META.yml new/Lingua-EN-Sentence-0.33/META.yml --- old/Lingua-EN-Sentence-0.31/META.yml 2018-08-19 10:26:07.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/META.yml 2022-07-05 03:41:47.000000000 +0200 @@ -7,7 +7,7 @@ configure_requires: Module::Build: '0.38' dynamic_config: 1 -generated_by: 'Module::Build version 0.4208, CPAN::Meta::Converter version 2.150005' +generated_by: 'Module::Build version 0.4229, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html @@ -16,12 +16,12 @@ provides: Lingua::EN::Sentence: file: lib/Lingua/EN/Sentence.pm - version: '0.31' + version: '0.33' requires: - perl: '5.1' - warnings: '1.12' + perl: v5.10.0 + warnings: '1.06' resources: license: http://dev.perl.org/licenses/ repository: https://github.com/kimryan/Lingua-EN-Sentence -version: '0.31' -x_serialization_backend: 'CPAN::Meta::YAML version 0.016' +version: '0.33' +x_serialization_backend: 'CPAN::Meta::YAML version 0.018' diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/Makefile.PL new/Lingua-EN-Sentence-0.33/Makefile.PL --- old/Lingua-EN-Sentence-0.31/Makefile.PL 2016-08-08 09:09:39.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/Makefile.PL 2022-07-05 03:35:59.000000000 +0200 @@ -7,8 +7,9 @@ 'VERSION_FROM' => 'lib/Lingua/EN/Sentence.pm', # finds $VERSION 'PREREQ_PM' => { - 'warnings' => 1.12 - }, + 'perl' => '5.10.0', + 'warnings' => '1.06' + }, 'AUTHOR' => 'Shlomo Yona, Kim Ryan', 'LICENSE' => 'perl', META_MERGE => { @@ -16,8 +17,8 @@ resources => { repository => { type => 'git', - url => 'https://github.com/dwimperl/Task-DWIM.git', - web => 'https://github.com/dwimperl/Task-DWIM', + url => 'https://github.com/kimryan/Lingua-EN-Sentence.git', + web => 'https://github.com/kimryan/Lingua-EN-Sentence' }, }, } diff -urN 
'--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/examples/demo.pl new/Lingua-EN-Sentence-0.33/examples/demo.pl --- old/Lingua-EN-Sentence-0.31/examples/demo.pl 2016-08-08 08:30:56.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/examples/demo.pl 2022-07-04 08:07:00.000000000 +0200 @@ -3,14 +3,36 @@ Demo program of CPAN module Lingua::EN::Sentence, sentence splitter =cut - use strict; use warnings; +use Lingua::EN::Sentence qw( get_sentences add_acronyms get_EOS set_EOS set_locale); +print("Started\n"); +my $text = q{ +A sentence usually ends with a dot, exclamation or question mark optionally followed by a space! +A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot + +Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split +as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq. +and (some text) ellipsis such as ... or . . are ignored. +Some valid cases canot be deteected, such as the answer is X. It cannot easily be +differentiated from the single letter-dot sequence to abbreviate a person's given name. +Numbered points within a sentence will not cause a split 1. Like this one. +See the code for all the rules that apply. +This string has 7 sentences. +}; -use Lingua::EN::Sentence qw( get_sentences add_acronyms get_EOS set_EOS set_locale); +my $sentences=get_sentences($text); ## Get the sentences. +my $num_sentences = (@$sentences); +my $i; +print("There are: $num_sentences sentences\n" ); +foreach my $sent (@$sentences) +{ + $i++; + print("SENTENCE $i:$sent\n"); +} -my $text = q{First sentence. + $text = q{First sentence. 12. point 12 Some numbers 12.46, -.123,3:. Some ???utf quotes wrap this??? ???And more???}; @@ -23,22 +45,21 @@ $text =~ s/???/"/g; # Change lines starting with numbered points from x. 
to x) to avoid confusion with dots -$text =~ s/\n(\d{1,})./\n$1\)/g; - - -print("Started\n"); -my $sentences=get_sentences($text); ## Get the sentences. + $text =~ s/\n(\d{1,})./\n$1\)/g; + + $sentences=get_sentences($text); ## Get the sentences. my $num_sentences = (@$sentences); - -print("There are: $num_sentences sentences\n" ); -my $i; foreach my $sent (@$sentences) { - $i++; - print("SENTENCE $i >>>$sent<<<\n"); + print("$sent\n"); } + + + + + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/lib/Lingua/EN/Sentence.pm new/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm --- old/Lingua-EN-Sentence-0.31/lib/Lingua/EN/Sentence.pm 2018-08-19 10:19:59.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/lib/Lingua/EN/Sentence.pm 2022-07-05 03:02:14.000000000 +0200 @@ -13,11 +13,27 @@ =head1 SYNOPSIS use Lingua::EN::Sentence qw( get_sentences add_acronyms ); - + add_acronyms('lt','gen'); ## adding support for 'Lt. Gen.' - my $sentences=get_sentences($text); ## Get the sentences. - foreach my $sentence (@$sentences) { - ## do something with $sentence + my $text = q{ + A sentence usually ends with a dot, exclamation or question mark optionally followed by a space! + A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot + + Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split + as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq. + and (some text) ellipsis such as ... or . . are ignored. + Some valid cases canot be deteected, such as the answer is X. It cannot easily be + differentiated from the single letter-dot sequence to abbreviate a person's given name. + Numbered points within a sentence will not cause a split 1. Like this one. + See the code for all the rules that apply. + This string has 7 sentences. + }; + + my $sentences=get_sentences($text); # Get the sentences. 
+ foreach my $sent (@$sentences) + { + $i++; + print("SENTENCE $i:$sent\n"); } @@ -31,18 +47,32 @@ segmentations. But some of them are already integrated into this code and are being taken care of. Still, if you see that there are words causing the get_sentences function to fail, you can add those to the module, so it notices them. +Note that abbreviations are case sensitive, so 'Mrs.' is recognised but not 'mrs.' =head1 ALGORITHM +The first step is to mark the dot ending an abbreviation by changing it to a special +character. Now it won't cause a sentence split. The original dot is restored after +the sentences are split + Basically, I use a 'brute' regular expression to split the text into sentences. (Well, nothing is yet split - I just mark the end-of-sentence). Then I look into a set of rules which decide when an end-of-sentence is justified and when it's a -mistake. In case of a mistake, the end-of-sentence mark is removed. +mistake. In case of a mistake, the end-of-sentence mark is removed. What are +such mistakes? -What are such mistakes? Cases of abbreviations, for example. I have a list of -such abbreviations (Please see public globals belwo for a list), and more -general rules (for example, the abbreviations 'i.e.' and '.e.g.' need not to be -in the list as a special rule takes care of all single letter abbreviations). +Letter-dot sequences: U.S.A. , i.e. , e.g. +Dot sequences: '..' or '...' or 'text . . more text' +Two carriage returns denote the end of a sentence even if it doesn't end with a dot + +=head1 LIMITATIONS + +1) John F. Kennedy was a former president +2) The answer is F. That ends the quiz + +In the first sentence, F. is detected as a persons initial and not the end of a sentence. +But this means we cannot detect the true end of sentence 2, which is after the 'F'. This +case is not common though. 
=head1 FUNCTIONS @@ -133,6 +163,8 @@ =head1 SEE ALSO Text::Sentence + Lingua::Sentence + Raku port of Lingua::EN::Sentence =head1 REPOSITORY @@ -148,7 +180,7 @@ =head1 COPYRIGHT AND LICENSE Copyright (c) 2001-2016 Shlomo Yona. All rights reserved. -Copyright (c) 2018 Kim Ryan. All rights reserved. +Copyright (c) 2022 Kim Ryan. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. @@ -187,7 +219,7 @@ use Carp qw/cluck/; use English; -our $VERSION = '0.31'; +our $VERSION = '0.33'; our $LOC; if ($OSNAME ne 'android') { @@ -201,34 +233,36 @@ @ISA = qw( Exporter ); @EXPORT_OK = qw( get_sentences add_acronyms get_acronyms set_acronyms get_EOS set_EOS set_locale); -our $EOS="\001"; +our $VERBOSE = 0; # echo intermediate data transforms, useful for debugging +our $EOS = "\001"; #"__EOS__"; +our $EOA = '__EOA__'; our $P = q/[\.!?]/; # PUNCTUATION $AP = q/(?:'|"|\?|\)|\]|\})?/; # AFTER PUNCTUATION our $PAP = $P.$AP; -my @PEOPLE = qw( mr mrs ms dr prof mme ms?gr sens? reps? gov attys? supt insp const det revd? ald rt hon); -my @TITLE_SUFFIXES = qw(PhD jn?r sn?r esq md llb); -my @MILITARY = qw( col gen lt cdr cmdr adm capt sgt cpl maj pte); -my @INSTITUTES = qw( dept univ assn bros); -my @COMPANIES = qw( inc ltd co corp); +# ACRONYMS AND ABBREVIATIONS +my @PEOPLE = qw( Mr Mrs Ms Dr Prof Mme Ms?gr Sens? Reps? Gov Attys? Supt Insp Const Det Revd? Ald Rt Hon); +my @TITLE_SUFFIXES = qw(PhD Jn?r Sn?r Esq MD LLB); +my @MILITARY = qw( Col Gen Lt Cm?dr Adm Capt Sgt Cpl Maj Pte); +my @INSTITUTES = qw( Dept Univ Assn Bros); +my @COMPANIES = qw( Inc Pty Ltd Co Corp); my @PLACES = qw( - arc al ave blv?d cl ct cres dr expy? fw?y hwa?y la pde? pl plz rd st tce + Arc Al Ave Blv?d Cl Ct Cres Dr Expy? Fw?y Hwa?y La Pde? 
Pl Plz Rd St Tce dist mt km in ft Ala Ariz Ark Cal Calif Col Colo Conn Del Fed Fla Ga Ida Id Ill Ind Ia Kan Kans Ken Ky - La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr Nev Mex Okla Ok Ore Penna Penn Pa Dak - Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta Man Ont Qu? Sask Yuk + La Me Md Is Mass Mich Minn Miss Mo Mont Neb Nebr Nev Mex Okla Ok Ore Penna Penn Pa Dak + Tenn Tex Ut Vt Va Wash Wis Wisc Wy Wyo USAFA Alta Man Ont Qu? Sask Yuk Aust Vic Qld Tas ); -my @MONTHS = qw(jan feb mar apr may jun jul aug sep sept oct nov dec); +my @MONTHS = qw(Jan Feb Mar Apr May Jun Jul Aug Sept? Oct Nov Dec); my @MISC = qw(no esp est); # Established my @LATIN = qw(vs etc al ibid sic); +my @MATH = qw(fig eq sec cf Thm Def Conj resp); -our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC, @LATIN ); -my $abbreviation_regex; -_set_abbreviations_regex(); +our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @MILITARY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC,@LATIN, @MATH); #============================================================================== @@ -245,13 +279,31 @@ # places which are not indeed end-of-sentence. 
#------------------------------------------------------------------------------ sub get_sentences { - my ($text)=@_; + my ($text) = @_; return [] unless defined $text; - my $marked_text = first_sentence_breaking($text); - my $fixed_marked_text = remove_false_end_of_sentence($marked_text); - $fixed_marked_text = split_unsplit_stuff($fixed_marked_text); - my @sentences = split(/$EOS/,$fixed_marked_text); + $VERBOSE and print("ORIGINAL\n$text\n"); + + $text = mark_up_abbreviations($text); + $VERBOSE and print("mark_up_abbreviations\n$text\n"); + + $text = first_sentence_breaking($text); + $VERBOSE and print("first_sentence_breaking\n$text\n"); + + $text = remove_false_end_of_sentence($text); + $VERBOSE and print("remove_false_end_of_sentence\n$text\n"); + + $text = split_unsplit_stuff($text); + $VERBOSE and print("split_unsplit_stuff\n$text\n"); + + my @sentences = split(/$EOS/,$text); my $cleaned_sentences = clean_sentences(\@sentences); + if ($VERBOSE) { + my $i; + foreach my $sent (@$cleaned_sentences) { + $i++; + print("SENTENCE $i >>>$sent<<<\n"); + } + } return $cleaned_sentences; } @@ -260,7 +312,6 @@ #------------------------------------------------------------------------------ sub add_acronyms { push @ABBREVIATIONS, @_; - _set_abbreviations_regex(); } #------------------------------------------------------------------------------ @@ -275,7 +326,6 @@ #------------------------------------------------------------------------------ sub set_acronyms { @ABBREVIATIONS=@_; - _set_abbreviations_regex(); } #------------------------------------------------------------------------------ @@ -295,7 +345,6 @@ return $EOS; } $EOS = $new_EOS; - _set_abbreviations_regex(); return $EOS; } @@ -341,24 +390,19 @@ } } - #============================================================================== # # Private methods # #============================================================================== -# save some time by pre-compiling a regex used for working with abbreviations -sub 
_set_abbreviations_regex { - my $abbreviations = join '|', @ABBREVIATIONS; - $abbreviation_regex = qr[(\b(?:$abbreviations)$PAP\s)$EOS]is; - return; -} ## Please email me any suggestions for optimizing these RegExps. sub remove_false_end_of_sentence { my ($marked_segment) = @_; -## ## don't do u.s.a. + + +## ## don't do U.S.A., U.K. ## $marked_segment=~s/(\.\w$PAP)$EOS/$1/sg; $marked_segment=~s/([^-\w]\w$PAP\s)$EOS/$1/sg; $marked_segment=~s/([^-\w]\w$P)$EOS/$1/sg; @@ -368,13 +412,12 @@ # Note: will fail for 12. Point 12 $marked_segment=~s/(\s[\w\d]\.\s+)$EOS/$1/sg; - # fix: bla bla... yada yada - $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg; - # fix "." "?" "!" - $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg; - ## fix where abbreviations exist - $marked_segment=~s/$abbreviation_regex/$1/g; + # fix ellipsis: bla bla... yada yada + $marked_segment=~s/(\.\.\. )$EOS([[:lower:]])/$1$2/sg; + # fix quoted EOS such as "." "?" "!" + $marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg; + # don't break after quote unless its a capital letter. $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg; @@ -382,34 +425,42 @@ $marked_segment=~s/(\s\.\s)$EOS(\s*)/$1$2/sg; $marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg; - $marked_segment=~s/(\s$PAP\s)$EOS/$1/sg; + return $marked_segment; } sub split_unsplit_stuff { my ($text) = @_; + # $text=~s/(\D\d+)($P)(\s+)/$1$2$EOS$3/sg; # breaks numbered points, such as {EOL}1. point one - $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg; - $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; + $text=~s/([\w $P]\d)($P)(\s+)/$1$2$EOS$3/sg; + + # eg 'end. (' -> 'end. $EOS (' + $text=~s/($PAP\s)(\s*\()/$1$EOS$2/gs; # open bracket + $text=~s/('\w$P)(\s)/$1$EOS$2/gs; $text=~s/(\sno\.)(\s+)(?!\d)/$1$EOS$2/gis; -## # split where single capital letter followed by dot makes sense to break. -## # notice these are exceptions to the general rule NOT to split on single -## # letter. 
-## # notice also that sibgle letter M is missing here, due to French 'mister' -## # which is represented as M. -## # -## # the rule will not split on names begining or containing -## # single capital letter dot in the first or second name -## # assuming 2 or three word name. -## $text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg; - + # split where single capital letter followed by dot makes sense to break. + # notice these are exceptions to the general rule NOT to split on single + # letter. + # notice also that single letter M is missing here, due to French 'mister' + # which is represented as M. + # + # the rule will not split on names beginning or containing + # single capital letter dot in the first or second name + # assuming 2 or three word name. + + # NOT WORKING , it breaks up U.S.A. after U. + # Valid cases if single letter thrn dot are rare, such as 'The answer is F'. + # Can't decipher meaning of this regex + # $text=~s/(\s[[:lower:]]\w+\s+[^[[:^upper:]M]\.)(?!\s+[[:upper:]]\.)/$1$EOS/sg; + # add EOS when you see "a.m." or "p.m." followed by a capital letter. $text=~s/([ap]\.m\.\s+)([[:upper:]])/$1$EOS$2/gs; @@ -426,16 +477,39 @@ $s=~s/^\s*//; $s=~s/\s*$//; ## $s=~s/\s+/ /g; + # replace end of abbrev marker with a dot + $s=~s/$EOA/\./g; push @$cleaned_sentences,$s; } return $cleaned_sentences; } +# Replace seuence such as Mr. A. Smith Jnr. with Mr__EOA__ A__EOA__ etc +# This simplifies the code that detects end of sentnees. The marker is +# replaced with the original dot adter sentence slitting + +sub mark_up_abbreviations { + my ($text) = @_; + + my %found_abbrevs; + foreach my $abbrev (@ABBREVIATIONS) { + if ($text=~/\b($abbrev)\./i) { + $found_abbrevs{$abbrev} = 1; + } + } + + foreach my $abbrev (keys %found_abbrevs) { + $text=~s/($abbrev)\./$1$EOA/gs; + } + + return $text; +} + sub first_sentence_breaking { my ($text) = @_; $text=~s/\n\s*\n/$EOS/gs; ## double new-line means a different sentence. 
$text=~s/($PAP\s)/$1$EOS/gs; - $text=~s/(\s\w$P)/$1$EOS/gs; # breake also when single letter comes before punc. + $text=~s/(\s\w$P)/$1$EOS/gs; # break also when single letter comes before punc. return $text; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Lingua-EN-Sentence-0.31/t/main.t new/Lingua-EN-Sentence-0.33/t/main.t --- old/Lingua-EN-Sentence-0.31/t/main.t 2016-08-08 08:40:41.000000000 +0200 +++ new/Lingua-EN-Sentence-0.33/t/main.t 2022-07-04 08:07:56.000000000 +0200 @@ -17,20 +17,25 @@ use Lingua::EN::Sentence qw( get_sentences add_acronyms get_acronyms); my $par = q{ -Returns the number of sentences in string. -A sentence ends with a dot, exclamation or question mark followed by a space! -Dots after single letters such as U.S.A or e.g. are ignored, - as well as common abbreviations such as Dr. Ms. esp. Apr. Calif. and Ave., - initials such as 'Mr. A. Smith'. -This string has 4 sentences. +A sentence usually ends with a dot, exclamation or question mark optionally followed by a space! +A string followed by 2 carriage returns denotes a sentence, even though it doesn't end in a dot + +Dots after single letters such as U.S.A. or in numbers like -12.34 will not cause a split +as well as common abbreviations such as Dr. I. Smith, Ms. A.B. Jones, Apr. Calif. Esq. +and (some text) ellipsis such as ... or . . are ignored. +Some valid cases canot be deteected, such as the answer is X. It cannot easily be +differentiated from the single letter-dot sequence to abbreviate a person's given name. +Numbered points within a sentence will not cause a split 1. Like this one. +See the code for all the rules that apply. +This string has 7 sentences. }; my $sentences=get_sentences($par); -is( @$sentences, 4,'sub sentence_count'); +is( @$sentences, 7,'sub sentence_count'); -$par .= 'Now add an acronym, such as ret. for retired.'; +$par .= 'Now add an acronym, such as Ret. 
for retired.'; add_acronyms('Ret'); $sentences=get_sentences($par); -is( @$sentences, 5,'sub add_acronyms'); +is( @$sentences, 8,'sub add_acronyms');