In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/966b4e4752e107a969ce19fbdbdb819547d41137?hp=de59f38ed9c2eb88d97eed5f7ade475479bc3248>
- Log ----------------------------------------------------------------- commit 966b4e4752e107a969ce19fbdbdb819547d41137 Author: Karl Williamson <[email protected]> Date: Mon Feb 18 17:57:11 2019 -0700 perlop: Improve documentation for (mostly) tr/// This adds examples and clarifications commit 0a142f463c08e1bf0466cee9a0f896e3d11e7dbf Author: Karl Williamson <[email protected]> Date: Mon Mar 4 10:30:38 2019 -0700 ebcdic_tables.h: Remove alien '#' These were introduced in c05125c57fd7868af65366bacb6fe40c04b1c719 in July 2018, and would cause any EBCDIC compilations to fail. That I found it by code inspection shows that we've lost all our EBCDIC smokers again. commit 635ff1f923d510fc8356bff054b96cbd97d93bf3 Author: Karl Williamson <[email protected]> Date: Mon Mar 4 10:20:35 2019 -0700 regen/ebcdic.pl: Move code to function This is for eventual use in being called from more than one place. ----------------------------------------------------------------------- Summary of changes: ebcdic_tables.h | 96 +++++++++++++++++----------------- pod/perlop.pod | 159 ++++++++++++++++++++++++++++++++++++-------------------- regen/ebcdic.pl | 26 ++++++--- 3 files changed, 170 insertions(+), 111 deletions(-) diff --git a/ebcdic_tables.h b/ebcdic_tables.h index 103e10ef90..99f533bae7 100644 --- a/ebcdic_tables.h +++ b/ebcdic_tables.h @@ -44,9 +44,9 @@ SOFTWARE. /* Index is ASCII platform code point; value is EBCDIC 1047 equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_a2e[]; + EXTCONST U8 PL_a2e[]; # else -# EXTCONST U8 PL_a2e[] = { + EXTCONST U8 PL_a2e[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F, @@ -70,9 +70,9 @@ SOFTWARE. 
/* Index is EBCDIC 1047 code point; value is ASCII platform equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_e2a[]; + EXTCONST U8 PL_e2a[]; # else -# EXTCONST U8 PL_e2a[] = { + EXTCONST U8 PL_e2a[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, @@ -97,9 +97,9 @@ SOFTWARE. /* (Confusingly named) Index is EBCDIC 1047 I8 byte; value is * EBCDIC 1047 UTF-EBCDIC equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_utf2e[]; + EXTCONST U8 PL_utf2e[]; # else -# EXTCONST U8 PL_utf2e[] = { + EXTCONST U8 PL_utf2e[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F, @@ -124,9 +124,9 @@ SOFTWARE. /* (Confusingly named) Index is EBCDIC 1047 UTF-EBCDIC byte; value is * EBCDIC 1047 I8 equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_e2utf[]; + EXTCONST U8 PL_e2utf[]; # else -# EXTCONST U8 PL_e2utf[] = { + EXTCONST U8 PL_e2utf[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, @@ -153,9 +153,9 @@ SOFTWARE. * flags table in tr16. The entries marked 9 in tr16 are continuation bytes * and are marked as length 1 here so that we can recover. */ # ifndef DOINIT -# EXTCONST U8 PL_utf8skip[]; + EXTCONST U8 PL_utf8skip[]; # else -# EXTCONST U8 PL_utf8skip[] = { + EXTCONST U8 PL_utf8skip[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*1_*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -179,9 +179,9 @@ SOFTWARE. 
/* Index is EBCDIC 1047 code point; value is its lowercase equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_latin1_lc[]; + EXTCONST U8 PL_latin1_lc[]; # else -# EXTCONST U8 PL_latin1_lc[] = { + EXTCONST U8 PL_latin1_lc[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -207,9 +207,9 @@ SOFTWARE. * The 'mod' in the name means that codepoints whose uppercase is above 255 or * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ # ifndef DOINIT -# EXTCONST U8 PL_mod_latin1_uc[]; + EXTCONST U8 PL_mod_latin1_uc[]; # else -# EXTCONST U8 PL_mod_latin1_uc[] = { + EXTCONST U8 PL_mod_latin1_uc[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -234,9 +234,9 @@ SOFTWARE. /* Index is EBCDIC 1047 code point; For A-Z, value is a-z; for a-z, value * is A-Z; all other code points map to themselves */ # ifndef DOINIT -# EXTCONST U8 PL_fold[]; + EXTCONST U8 PL_fold[]; # else -# EXTCONST U8 PL_fold[] = { + EXTCONST U8 PL_fold[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -262,9 +262,9 @@ SOFTWARE. * (A => a; a => A, etc) in the 0-255 range. 
If no such equivalent, value is * the code point itself */ # ifndef DOINIT -# EXTCONST U8 PL_fold_latin1[]; + EXTCONST U8 PL_fold_latin1[]; # else -# EXTCONST U8 PL_fold_latin1[] = { + EXTCONST U8 PL_fold_latin1[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -293,9 +293,9 @@ SOFTWARE. */ # ifndef DOINIT -# EXTCONST U8 PL_extended_utf8_dfa_tab[]; + EXTCONST U8 PL_extended_utf8_dfa_tab[]; # else -# EXTCONST U8 PL_extended_utf8_dfa_tab[] = { + EXTCONST U8 PL_extended_utf8_dfa_tab[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -334,9 +334,9 @@ SOFTWARE. */ # ifndef DOINIT -# EXTCONST U16 PL_strict_utf8_dfa_tab[]; + EXTCONST U16 PL_strict_utf8_dfa_tab[]; # else -# EXTCONST U16 PL_strict_utf8_dfa_tab[] = { + EXTCONST U16 PL_strict_utf8_dfa_tab[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -380,9 +380,9 @@ SOFTWARE. */ # ifndef DOINIT -# EXTCONST U8 PL_c9_utf8_dfa_tab[]; + EXTCONST U8 PL_c9_utf8_dfa_tab[]; # else -# EXTCONST U8 PL_c9_utf8_dfa_tab[] = { + EXTCONST U8 PL_c9_utf8_dfa_tab[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -421,9 +421,9 @@ SOFTWARE. 
/* Index is ASCII platform code point; value is EBCDIC 037 equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_a2e[]; + EXTCONST U8 PL_a2e[]; # else -# EXTCONST U8 PL_a2e[] = { + EXTCONST U8 PL_a2e[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x25,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F, @@ -447,9 +447,9 @@ SOFTWARE. /* Index is EBCDIC 037 code point; value is ASCII platform equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_e2a[]; + EXTCONST U8 PL_e2a[]; # else -# EXTCONST U8 PL_e2a[] = { + EXTCONST U8 PL_e2a[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x9D,0x85,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, @@ -474,9 +474,9 @@ SOFTWARE. /* (Confusingly named) Index is EBCDIC 037 I8 byte; value is * EBCDIC 037 UTF-EBCDIC equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_utf2e[]; + EXTCONST U8 PL_utf2e[]; # else -# EXTCONST U8 PL_utf2e[] = { + EXTCONST U8 PL_utf2e[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x25,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F, @@ -501,9 +501,9 @@ SOFTWARE. /* (Confusingly named) Index is EBCDIC 037 UTF-EBCDIC byte; value is * EBCDIC 037 I8 equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_e2utf[]; + EXTCONST U8 PL_e2utf[]; # else -# EXTCONST U8 PL_e2utf[] = { + EXTCONST U8 PL_e2utf[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x9D,0x85,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F, @@ -530,9 +530,9 @@ SOFTWARE. * flags table in tr16. 
The entries marked 9 in tr16 are continuation bytes * and are marked as length 1 here so that we can recover. */ # ifndef DOINIT -# EXTCONST U8 PL_utf8skip[]; + EXTCONST U8 PL_utf8skip[]; # else -# EXTCONST U8 PL_utf8skip[] = { + EXTCONST U8 PL_utf8skip[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*1_*/ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -556,9 +556,9 @@ SOFTWARE. /* Index is EBCDIC 037 code point; value is its lowercase equivalent */ # ifndef DOINIT -# EXTCONST U8 PL_latin1_lc[]; + EXTCONST U8 PL_latin1_lc[]; # else -# EXTCONST U8 PL_latin1_lc[] = { + EXTCONST U8 PL_latin1_lc[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -584,9 +584,9 @@ SOFTWARE. * The 'mod' in the name means that codepoints whose uppercase is above 255 or * longer than 1 character map to LATIN SMALL LETTER Y WITH DIARESIS */ # ifndef DOINIT -# EXTCONST U8 PL_mod_latin1_uc[]; + EXTCONST U8 PL_mod_latin1_uc[]; # else -# EXTCONST U8 PL_mod_latin1_uc[] = { + EXTCONST U8 PL_mod_latin1_uc[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -611,9 +611,9 @@ SOFTWARE. 
/* Index is EBCDIC 037 code point; For A-Z, value is a-z; for a-z, value * is A-Z; all other code points map to themselves */ # ifndef DOINIT -# EXTCONST U8 PL_fold[]; + EXTCONST U8 PL_fold[]; # else -# EXTCONST U8 PL_fold[] = { + EXTCONST U8 PL_fold[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -639,9 +639,9 @@ SOFTWARE. * (A => a; a => A, etc) in the 0-255 range. If no such equivalent, value is * the code point itself */ # ifndef DOINIT -# EXTCONST U8 PL_fold_latin1[]; + EXTCONST U8 PL_fold_latin1[]; # else -# EXTCONST U8 PL_fold_latin1[] = { + EXTCONST U8 PL_fold_latin1[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_*/0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, /*1_*/0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, @@ -670,9 +670,9 @@ SOFTWARE. */ # ifndef DOINIT -# EXTCONST U8 PL_extended_utf8_dfa_tab[]; + EXTCONST U8 PL_extended_utf8_dfa_tab[]; # else -# EXTCONST U8 PL_extended_utf8_dfa_tab[] = { + EXTCONST U8 PL_extended_utf8_dfa_tab[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -711,9 +711,9 @@ SOFTWARE. */ # ifndef DOINIT -# EXTCONST U16 PL_strict_utf8_dfa_tab[]; + EXTCONST U16 PL_strict_utf8_dfa_tab[]; # else -# EXTCONST U16 PL_strict_utf8_dfa_tab[] = { + EXTCONST U16 PL_strict_utf8_dfa_tab[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -757,9 +757,9 @@ SOFTWARE. 
*/ # ifndef DOINIT -# EXTCONST U8 PL_c9_utf8_dfa_tab[]; + EXTCONST U8 PL_c9_utf8_dfa_tab[]; # else -# EXTCONST U8 PL_c9_utf8_dfa_tab[] = { + EXTCONST U8 PL_c9_utf8_dfa_tab[] = { /* _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F*/ /*0_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*1_ */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/pod/perlop.pod b/pod/perlop.pod index af695b678f..dd658bf5fb 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -2211,6 +2211,10 @@ Examples: s/([^ ]*) *([^ ]*)/$2 $1/; # reverse 1st two fields + $foo !~ s/A/a/g; # Lowercase all A's in $foo; return + # 0 if any were found and changed; + # otherwise return 1 + Note the use of C<$> instead of C<\> in the last example. Unlike B<sed>, we use the \<I<digit>> form only in the left hand side. Anywhere else it's $<I<digit>>. @@ -2405,10 +2409,14 @@ X<tr> X<y> X<transliterate> X</c> X</d> X</s> =item C<y/I<SEARCHLIST>/I<REPLACEMENTLIST>/cdsr> -Transliterates all occurrences of the characters found in the search list -with the corresponding character in the replacement list. It returns -the number of characters replaced or deleted. If no string is -specified via the C<=~> or C<!~> operator, the C<$_> string is transliterated. +Transliterates all occurrences of the characters found (or not found +if the C</c> modifier is specified) in the search list with the +positionally corresponding character in the replacement list, possibly +deleting some, depending on the modifiers specified. It returns the +number of characters replaced or deleted. If no string is specified via +the C<=~> or C<!~> operator, the C<$_> string is transliterated. + +For B<sed> devotees, C<y> is provided as a synonym for C<tr>. 
If the C</r> (non-destructive) option is present, a new copy of the string is made and its characters transliterated, and this copy is returned no @@ -2428,20 +2436,18 @@ Otherwise, a character range may be specified with a hyphen, so C<tr/A-J/0-9/> does the same replacement as C<tr/ACEGIBDFHJ/0246813579/>. -For B<sed> devotees, C<y> is provided as a synonym for C<tr>. - If the I<SEARCHLIST> is delimited by bracketing quotes, the I<REPLACEMENTLIST> must have its own pair of quotes, which may or may not be bracketing quotes; for example, C<tr[aeiouy][yuoiea]> or C<tr(+\-*/)/ABCD/>. -Characters may be literals or (if the delimiters aren't single quotes) +Characters may be literals, or (if the delimiters aren't single quotes) any of the escape sequences accepted in double-quoted strings. But there is never any variable interpolation, so C<"$"> and C<"@"> are -treated as literals. A hyphen at the beginning or end, or preceded by a -backslash is considered a literal. Escape sequence details are in L<the -table near the beginning of this section|/Quote and Quote-like -Operators>. +always treated as literals. A hyphen at the beginning or end, or +preceded by a backslash is also always considered a literal. Escape +sequence details are in L<the table near the beginning of this +section|/Quote and Quote-like Operators>. Note that C<tr> does B<not> do regular expression character classes such as C<\d> or C<\pL>. The C<tr> operator is not equivalent to the C<L<tr(1)>> @@ -2480,85 +2486,128 @@ range's end points are expressed as C<\N{...}> removes from C<$string> all the platform's characters which are equivalent to any of Unicode U+0020, U+0021, ... U+007D, U+007E. This is a portable range, and has the same effect on every platform it is -run on. It turns out that in this example, these are the ASCII +run on. In this example, these are the ASCII printable characters. So after this is run, C<$string> has only controls and characters which have no ASCII equivalents. 
But, even for portable ranges, it is not generally obvious what is -included without having to look things up. A sound principle is to use -only ranges that both begin from and end at either ASCII alphabetics of -equal case (C<b-e>, C<B-E>), or digits (C<1-4>). Anything else is -unclear (and unportable unless C<\N{...}> is used). If in doubt, spell -out the character sets in full. +included without having to look things up in the manual. A sound +principle is to use only ranges that both begin from, and end at, either +ASCII alphabetics of equal case (C<b-e>, C<B-E>), or digits (C<1-4>). +Anything else is unclear (and unportable unless C<\N{...}> is used). If +in doubt, spell out the character sets in full. Options: c Complement the SEARCHLIST. d Delete found but unreplaced characters. - s Squash duplicate replaced characters. r Return the modified string and leave the original string untouched. + s Squash duplicate replaced characters. -If the C</c> modifier is specified, the I<SEARCHLIST> character set -is complemented. So for example these two are equivalent (the exact -maximum number will depend on your platform): - - tr/\x00-\xfd/ABCD/c - tr/\xfe-\x{7fffffff}/ABCD/ +If the C</d> modifier is specified, any characters specified by +I<SEARCHLIST> not found in I<REPLACEMENTLIST> are deleted. (Note that +this is slightly more flexible than the behavior of some B<tr> programs, +which delete anything they find in the I<SEARCHLIST>, period.) -If the C</d> modifier is specified, any characters -specified by I<SEARCHLIST> not found in I<REPLACEMENTLIST> are deleted. -(Note that this is slightly more flexible than the behavior of some -B<tr> programs, which delete anything they find in the I<SEARCHLIST>, -period.) +If the C</s> modifier is specified, sequences of characters, all in a +row, that were transliterated to the same character are squashed down to +a single instance of that character. 
-If the C</s> modifier is specified, runs of the same character in the -result, where each those characters were substituted by the -transliteration, are squashed down to a single instance of the character. + my $a = "aaaba"; + $a =~ tr/a/a/s; # $a now is "aba" If the C</d> modifier is used, the I<REPLACEMENTLIST> is always interpreted exactly as specified. Otherwise, if the I<REPLACEMENTLIST> is shorter -than the I<SEARCHLIST>, the final character is replicated till it is long -enough. If the I<REPLACEMENTLIST> is empty, the I<SEARCHLIST> is replicated. -This latter is useful for counting characters in a class or for -squashing character sequences in a class. For example, each of these pairs -are equivalent: +than the I<SEARCHLIST>, the final character, if any, is replicated until +it is long enough. There won't be a final character if and only if the +I<REPLACEMENTLIST> is empty, in which case I<REPLACEMENTLIST> is +copied from I<SEARCHLIST>. An empty I<REPLACEMENTLIST> is useful +for counting characters in a class, or for squashing character sequences +in a class. tr/abcd// tr/abcd/abcd/ tr/abcd/AB/ tr/abcd/ABBB/ tr/abcd//d s/[abcd]//g tr/abcd/AB/d (tr/ab/AB/ + s/[cd]//g) - but run together +If the C</c> modifier is specified, the characters to be transliterated +are the ones NOT in I<SEARCHLIST>, that is, it is complemented. If +C</d> and/or C</s> are also specified, they apply to the complemented +I<SEARCHLIST>. Recall that if I<REPLACEMENTLIST> is empty (except +under C</d>), a copy of I<SEARCHLIST> is used instead. That copy is made +after complementing under C</c>. I<SEARCHLIST> is sorted by code point +order after complementing, and any I<REPLACEMENTLIST> is applied to +that sorted result. This means that under C</c>, the order of the +characters specified in I<SEARCHLIST> is irrelevant. 
This can +lead to different results on EBCDIC systems if I<REPLACEMENTLIST> +contains more than one character, hence it is generally non-portable to +use C</c> with such a I<REPLACEMENTLIST>. + +Another way of describing the operation is this: +If C</c> is specified, the I<SEARCHLIST> is sorted by code point order, +then complemented. If I<REPLACEMENTLIST> is empty and C</d> is not +specified, I<REPLACEMENTLIST> is replaced by a copy of I<SEARCHLIST> (as +modified under C</c>), and these potentially modified lists are used as +the basis for what follows. Any character in the target string that +isn't in I<SEARCHLIST> is passed through unchanged. Every other +character in the target string is replaced by the character in +I<REPLACEMENTLIST> that positionally corresponds to its mate in +I<SEARCHLIST>, except that under C</s>, the 2nd and following characters +are squeezed out in a sequence of characters in a row that all translate +to the same character. If I<SEARCHLIST> is longer than +I<REPLACEMENTLIST>, characters in the target string that match a +character in I<SEARCHLIST> that doesn't have a correspondence in +I<REPLACEMENTLIST> are either deleted from the target string if C</d> is +specified; or replaced by the final character in I<REPLACEMENTLIST> if +C</d> isn't specified. + Some examples: - $ARGV[1] =~ tr/A-Z/a-z/; # canonicalize to lower case ASCII + $ARGV[1] =~ tr/A-Z/a-z/; # canonicalize to lower case ASCII + + $cnt = tr/*/*/; # count the stars in $_ + $cnt = tr/*//; # same thing + + $cnt = $sky =~ tr/*/*/; # count the stars in $sky + $cnt = $sky =~ tr/*//; # same thing - $cnt = tr/*/*/; # count the stars in $_ + $cnt = $sky =~ tr/*//c; # count all the non-stars in $sky + $cnt = $sky =~ tr/*/*/c; # same, but transliterate each non-star + # into a star, leaving the already-stars + # alone. Afterwards, everything in $sky + # is a star. 
- $cnt = $sky =~ tr/*/*/; # count the stars in $sky + $cnt = tr/0-9//; # count the ASCII digits in $_ - $cnt = tr/0-9//; # count the digits in $_ + tr/a-zA-Z//s; # bookkeeper -> bokeper + tr/o/o/s; # bookkeeper -> bokkeeper + tr/oe/oe/s; # bookkeeper -> bokkeper + tr/oe//s; # bookkeeper -> bokkeper + tr/oe/o/s; # bookkeeper -> bokkopor - tr/a-zA-Z//s; # bookkeeper -> bokeper + ($HOST = $host) =~ tr/a-z/A-Z/; + $HOST = $host =~ tr/a-z/A-Z/r; # same thing - ($HOST = $host) =~ tr/a-z/A-Z/; - $HOST = $host =~ tr/a-z/A-Z/r; # same thing + $HOST = $host =~ tr/a-z/A-Z/r # chained with s///r + =~ s/:/ -p/r; - $HOST = $host =~ tr/a-z/A-Z/r # chained with s///r - =~ s/:/ -p/r; + tr/a-zA-Z/ /cs; # change non-alphas to single space - tr/a-zA-Z/ /cs; # change non-alphas to single space + @stripped = map tr/a-zA-Z/ /csr, @original; + # /r with map - @stripped = map tr/a-zA-Z/ /csr, @original; - # /r with map + tr [\200-\377] + [\000-\177]; # wickedly delete 8th bit - tr [\200-\377] - [\000-\177]; # wickedly delete 8th bit + $foo !~ tr/A/a/ # transliterate all the A's in $foo to 'a', + # return 0 if any were found and changed. + # Otherwise return 1 If multiple transliterations are given for a character, only the first one is used: - tr/AAA/XYZ/ + tr/AAA/XYZ/ will transliterate any A to X. @@ -2567,10 +2616,10 @@ the I<SEARCHLIST> nor the I<REPLACEMENTLIST> are subjected to double quote interpolation. That means that if you want to use variables, you must use an C<eval()>: - eval "tr/$oldlist/$newlist/"; - die $@ if $@; + eval "tr/$oldlist/$newlist/"; + die $@ if $@; - eval "tr/$oldlist/$newlist/, 1" or die $@; + eval "tr/$oldlist/$newlist/, 1" or die $@; =item C<< <<I<EOF> >> X<here-doc> X<heredoc> X<here-document> X<<< << >>> diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl index 0e40b13204..cfb4d4ea07 100644 --- a/regen/ebcdic.pl +++ b/regen/ebcdic.pl @@ -51,6 +51,22 @@ sub get_column_headers ($$;$) { return $header . 
"*/\n"; } +sub output_table_start($$$) { + my ($out_fh, $TYPE, $name) = @_; + + my $declaration = "EXTCONST $TYPE $name\[\]"; + print $out_fh <<EOF; +# ifndef DOINIT + $declaration; +# else + $declaration = { +EOF +} + +sub output_table_end($) { + print $out_fh "};\n# endif\n\n"; +} + sub output_table ($$;$) { my $table_ref = shift; my $name = shift; @@ -124,13 +140,7 @@ EOF my $TYPE = 'U8'; $TYPE = 'U16' if grep { $_ > 255 } @$table_ref; - my $declaration = "EXTCONST $TYPE $name\[\]"; - print $out_fh <<EOF; -# ifndef DOINIT -# $declaration; -# else -# $declaration = { -EOF + output_table_start $out_fh, $TYPE, $name; # First the headers for the columns print $out_fh get_column_headers($row_hdr_length, $field_width); @@ -192,7 +202,7 @@ EOF print $out_fh get_column_headers($row_hdr_length, $field_width, ($is_dfa) ? $columns_after_256 : undef); - print $out_fh "};\n# endif\n\n"; + output_table_end($out_fh); } print $out_fh <<'END'; -- Perl5 Master Repository
