diff options
Diffstat (limited to 'upstream/archlinux/man1/perlunicook.1perl')
-rw-r--r-- | upstream/archlinux/man1/perlunicook.1perl | 968 |
1 files changed, 968 insertions, 0 deletions
diff --git a/upstream/archlinux/man1/perlunicook.1perl b/upstream/archlinux/man1/perlunicook.1perl new file mode 100644 index 00000000..589062a3 --- /dev/null +++ b/upstream/archlinux/man1/perlunicook.1perl @@ -0,0 +1,968 @@ +.\" -*- mode: troff; coding: utf-8 -*- +.\" Automatically generated by Pod::Man 5.01 (Pod::Simple 3.43) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" \*(C` and \*(C' are quotes in nroff, nothing in troff, for use with C<>. +.ie n \{\ +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds C` +. ds C' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is >0, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.\" +.\" Avoid warning from groff about undefined register 'F'. +.de IX +.. +.nr rF 0 +.if \n(.g .if rF .nr rF 1 +.if (\n(rF:(\n(.g==0)) \{\ +. if \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. if !\nF==2 \{\ +. nr % 0 +. nr F 2 +. \} +. \} +.\} +.rr rF +.\" ======================================================================== +.\" +.IX Title "PERLUNICOOK 1perl" +.TH PERLUNICOOK 1perl 2024-02-11 "perl v5.38.2" "Perl Programmers Reference Guide" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. 
+.if n .ad l
+.nh
+.SH NAME
+perlunicook \- cookbookish examples of handling Unicode in Perl
+.SH DESCRIPTION
+.IX Header "DESCRIPTION"
+This manpage contains short recipes demonstrating how to handle common Unicode
+operations in Perl, plus one complete program at the end. Any undeclared
+variables in individual recipes are assumed to have a previous appropriate
+value in them.
+.SH EXAMPLES
+.IX Header "EXAMPLES"
+.SS "℞ 0: Standard preamble"
+.IX Subsection "℞ 0: Standard preamble"
+Unless otherwise noted, all examples below require this standard preamble
+to work correctly, with the \f(CW\*(C`#!\*(C'\fR adjusted to work on your system:
+.PP
+.Vb 1
+\& #!/usr/bin/env perl
+\&
+\& use v5.36; # or later to get "unicode_strings" feature,
+\& # plus strict, warnings
+\& use utf8; # so literals and identifiers can be in UTF\-8
+\& use warnings qw(FATAL utf8); # fatalize encoding glitches
+\& use open qw(:std :encoding(UTF\-8)); # undeclared streams in UTF\-8
+\& use charnames qw(:full :short); # unneeded in v5.16
+.Ve
+.PP
+This \fIdoes\fR make even Unix programmers \f(CW\*(C`binmode\*(C'\fR your binary streams,
+or open them with \f(CW\*(C`:raw\*(C'\fR, but that's the only way to get at them
+portably anyway.
+.PP
+\&\fBWARNING\fR: \f(CW\*(C`use autodie\*(C'\fR (pre 2.26) and \f(CW\*(C`use open\*(C'\fR do not get along with each
+other.
+.SS "℞ 1: Generic Unicode-savvy filter"
+.IX Subsection "℞ 1: Generic Unicode-savvy filter"
+Always decompose on the way in, then recompose on the way out.
+.PP
+.Vb 1
+\& use Unicode::Normalize;
+\&
+\& while (<>) {
+\& $_ = NFD($_); # decompose + reorder canonically
+\& ...
+\& } continue {
+\& print NFC($_); # recompose (where possible) + reorder canonically
+\& }
+.Ve
+.SS "℞ 2: Fine-tuning Unicode warnings"
+.IX Subsection "℞ 2: Fine-tuning Unicode warnings"
+As of v5.14, Perl distinguishes three subclasses of UTF‑8 warnings. 
+.PP +.Vb 4 +\& use v5.14; # subwarnings unavailable any earlier +\& no warnings "nonchar"; # the 66 forbidden non\-characters +\& no warnings "surrogate"; # UTF\-16/CESU\-8 nonsense +\& no warnings "non_unicode"; # for codepoints over 0x10_FFFF +.Ve +.SS "℞ 3: Declare source in utf8 for identifiers and literals" +.IX Subsection "℞ 3: Declare source in utf8 for identifiers and literals" +Without the all-critical \f(CW\*(C`use utf8\*(C'\fR declaration, putting UTF‑8 in your +literals and identifiers won’t work right. If you used the standard +preamble just given above, this already happened. If you did, you can +do things like this: +.PP +.Vb 1 +\& use utf8; +\& +\& my $measure = "Ångström"; +\& my @μsoft = qw( cp852 cp1251 cp1252 ); +\& my @ὑπέρμεγας = qw( ὑπέρ μεγας ); +\& my @鯉 = qw( koi8\-f koi8\-u koi8\-r ); +\& my $motto = "👪 💗 🐪"; # FAMILY, GROWING HEART, DROMEDARY CAMEL +.Ve +.PP +If you forget \f(CW\*(C`use utf8\*(C'\fR, high bytes will be misunderstood as +separate characters, and nothing will work right. +.SS "℞ 4: Characters and their numbers" +.IX Subsection "℞ 4: Characters and their numbers" +The \f(CW\*(C`ord\*(C'\fR and \f(CW\*(C`chr\*(C'\fR functions work transparently on all codepoints, +not just on ASCII alone — nor in fact, not even just on Unicode alone. +.PP +.Vb 3 +\& # ASCII characters +\& ord("A") +\& chr(65) +\& +\& # characters from the Basic Multilingual Plane +\& ord("Σ") +\& chr(0x3A3) +\& +\& # beyond the BMP +\& ord("𝑛") # MATHEMATICAL ITALIC SMALL N +\& chr(0x1D45B) +\& +\& # beyond Unicode! (up to MAXINT) +\& ord("\ex{20_0000}") +\& chr(0x20_0000) +.Ve +.SS "℞ 5: Unicode literals by character number" +.IX Subsection "℞ 5: Unicode literals by character number" +In an interpolated literal, whether a double-quoted string or a +regex, you may specify a character by its number using the +\&\f(CW\*(C`\ex{\fR\f(CIHHHHHH\fR\f(CW}\*(C'\fR escape. 
+.PP +.Vb 2 +\& String: "\ex{3a3}" +\& Regex: /\ex{3a3}/ +\& +\& String: "\ex{1d45b}" +\& Regex: /\ex{1d45b}/ +\& +\& # even non\-BMP ranges in regex work fine +\& /[\ex{1D434}\-\ex{1D467}]/ +.Ve +.SS "℞ 6: Get character name by number" +.IX Subsection "℞ 6: Get character name by number" +.Vb 2 +\& use charnames (); +\& my $name = charnames::viacode(0x03A3); +.Ve +.SS "℞ 7: Get character number by name" +.IX Subsection "℞ 7: Get character number by name" +.Vb 2 +\& use charnames (); +\& my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA"); +.Ve +.SS "℞ 8: Unicode named characters" +.IX Subsection "℞ 8: Unicode named characters" +Use the \f(CW\*(C`\eN{\fR\f(CIcharname\fR\f(CW}\*(C'\fR notation to get the character +by that name for use in interpolated literals (double-quoted +strings and regexes). In v5.16, there is an implicit +.PP +.Vb 1 +\& use charnames qw(:full :short); +.Ve +.PP +But prior to v5.16, you must be explicit about which set of charnames you +want. The \f(CW\*(C`:full\*(C'\fR names are the official Unicode character name, alias, or +sequence, which all share a namespace. +.PP +.Vb 1 +\& use charnames qw(:full :short latin greek); +\& +\& "\eN{MATHEMATICAL ITALIC SMALL N}" # :full +\& "\eN{GREEK CAPITAL LETTER SIGMA}" # :full +.Ve +.PP +Anything else is a Perl-specific convenience abbreviation. Specify one or +more scripts by names if you want short names that are script-specific. +.PP +.Vb 3 +\& "\eN{Greek:Sigma}" # :short +\& "\eN{ae}" # latin +\& "\eN{epsilon}" # greek +.Ve +.PP +The v5.16 release also supports a \f(CW\*(C`:loose\*(C'\fR import for loose matching of +character names, which works just like loose matching of property names: +that is, it disregards case, whitespace, and underscores: +.PP +.Vb 1 +\& "\eN{euro sign}" # :loose (from v5.16) +.Ve +.PP +Starting in v5.32, you can also use +.PP +.Vb 1 +\& qr/\ep{name=euro sign}/ +.Ve +.PP +to get official Unicode named characters in regular expressions. 
Loose +matching is always done for these. +.SS "℞ 9: Unicode named sequences" +.IX Subsection "℞ 9: Unicode named sequences" +These look just like character names but return multiple codepoints. +Notice the \f(CW%vx\fR vector-print functionality in \f(CW\*(C`printf\*(C'\fR. +.PP +.Vb 4 +\& use charnames qw(:full); +\& my $seq = "\eN{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}"; +\& printf "U+%v04X\en", $seq; +\& U+0100.0300 +.Ve +.SS "℞ 10: Custom named characters" +.IX Subsection "℞ 10: Custom named characters" +Use \f(CW\*(C`:alias\*(C'\fR to give your own lexically scoped nicknames to existing +characters, or even to give unnamed private-use characters useful names. +.PP +.Vb 4 +\& use charnames ":full", ":alias" => { +\& ecute => "LATIN SMALL LETTER E WITH ACUTE", +\& "APPLE LOGO" => 0xF8FF, # private use character +\& }; +\& +\& "\eN{ecute}" +\& "\eN{APPLE LOGO}" +.Ve +.SS "℞ 11: Names of CJK codepoints" +.IX Subsection "℞ 11: Names of CJK codepoints" +Sinograms like “東京” come back with character names of +\&\f(CW\*(C`CJK UNIFIED IDEOGRAPH\-6771\*(C'\fR and \f(CW\*(C`CJK UNIFIED IDEOGRAPH\-4EAC\*(C'\fR, +because their “names” vary. The CPAN \f(CW\*(C`Unicode::Unihan\*(C'\fR module +has a large database for decoding these (and a whole lot more), provided you +know how to understand its output. 
+.PP +.Vb 8 +\& # cpan \-i Unicode::Unihan +\& use Unicode::Unihan; +\& my $str = "東京"; +\& my $unhan = Unicode::Unihan\->new; +\& for my $lang (qw(Mandarin Cantonese Korean JapaneseOn JapaneseKun)) { +\& printf "CJK $str in %\-12s is ", $lang; +\& say $unhan\->$lang($str); +\& } +.Ve +.PP +prints: +.PP +.Vb 5 +\& CJK 東京 in Mandarin is DONG1JING1 +\& CJK 東京 in Cantonese is dung1ging1 +\& CJK 東京 in Korean is TONGKYENG +\& CJK 東京 in JapaneseOn is TOUKYOU KEI KIN +\& CJK 東京 in JapaneseKun is HIGASHI AZUMAMIYAKO +.Ve +.PP +If you have a specific romanization scheme in mind, +use the specific module: +.PP +.Vb 5 +\& # cpan \-i Lingua::JA::Romanize::Japanese +\& use Lingua::JA::Romanize::Japanese; +\& my $k2r = Lingua::JA::Romanize::Japanese\->new; +\& my $str = "東京"; +\& say "Japanese for $str is ", $k2r\->chars($str); +.Ve +.PP +prints +.PP +.Vb 1 +\& Japanese for 東京 is toukyou +.Ve +.SS "℞ 12: Explicit encode/decode" +.IX Subsection "℞ 12: Explicit encode/decode" +On rare occasion, such as a database read, you may be +given encoded text you need to decode. +.PP +.Vb 1 +\& use Encode qw(encode decode); +\& +\& my $chars = decode("shiftjis", $bytes, 1); +\& # OR +\& my $bytes = encode("MIME\-Header\-ISO_2022_JP", $chars, 1); +.Ve +.PP +For streams all in the same encoding, don't use encode/decode; instead +set the file encoding when you open the file or immediately after with +\&\f(CW\*(C`binmode\*(C'\fR as described later below. +.SS "℞ 13: Decode program arguments as utf8" +.IX Subsection "℞ 13: Decode program arguments as utf8" +.Vb 6 +\& $ perl \-CA ... 
+\& or +\& $ export PERL_UNICODE=A +\& or +\& use Encode qw(decode); +\& @ARGV = map { decode(\*(AqUTF\-8\*(Aq, $_, 1) } @ARGV; +.Ve +.SS "℞ 14: Decode program arguments as locale encoding" +.IX Subsection "℞ 14: Decode program arguments as locale encoding" +.Vb 3 +\& # cpan \-i Encode::Locale +\& use Encode qw(locale); +\& use Encode::Locale; +\& +\& # use "locale" as an arg to encode/decode +\& @ARGV = map { decode(locale => $_, 1) } @ARGV; +.Ve +.SS "℞ 15: Declare STD{IN,OUT,ERR} to be utf8" +.IX Subsection "℞ 15: Declare STD{IN,OUT,ERR} to be utf8" +Use a command-line option, an environment variable, or else +call \f(CW\*(C`binmode\*(C'\fR explicitly: +.PP +.Vb 9 +\& $ perl \-CS ... +\& or +\& $ export PERL_UNICODE=S +\& or +\& use open qw(:std :encoding(UTF\-8)); +\& or +\& binmode(STDIN, ":encoding(UTF\-8)"); +\& binmode(STDOUT, ":utf8"); +\& binmode(STDERR, ":utf8"); +.Ve +.SS "℞ 16: Declare STD{IN,OUT,ERR} to be in locale encoding" +.IX Subsection "℞ 16: Declare STD{IN,OUT,ERR} to be in locale encoding" +.Vb 3 +\& # cpan \-i Encode::Locale +\& use Encode; +\& use Encode::Locale; +\& +\& # or as a stream for binmode or open +\& binmode STDIN, ":encoding(console_in)" if \-t STDIN; +\& binmode STDOUT, ":encoding(console_out)" if \-t STDOUT; +\& binmode STDERR, ":encoding(console_out)" if \-t STDERR; +.Ve +.SS "℞ 17: Make file I/O default to utf8" +.IX Subsection "℞ 17: Make file I/O default to utf8" +Files opened without an encoding argument will be in UTF\-8: +.PP +.Vb 5 +\& $ perl \-CD ... +\& or +\& $ export PERL_UNICODE=D +\& or +\& use open qw(:encoding(UTF\-8)); +.Ve +.SS "℞ 18: Make all I/O and args default to utf8" +.IX Subsection "℞ 18: Make all I/O and args default to utf8" +.Vb 7 +\& $ perl \-CSDA ... 
+\& or +\& $ export PERL_UNICODE=SDA +\& or +\& use open qw(:std :encoding(UTF\-8)); +\& use Encode qw(decode); +\& @ARGV = map { decode(\*(AqUTF\-8\*(Aq, $_, 1) } @ARGV; +.Ve +.SS "℞ 19: Open file with specific encoding" +.IX Subsection "℞ 19: Open file with specific encoding" +Specify stream encoding. This is the normal way +to deal with encoded text, not by calling low-level +functions. +.PP +.Vb 7 +\& # input file +\& open(my $in_file, "< :encoding(UTF\-16)", "wintext"); +\& OR +\& open(my $in_file, "<", "wintext"); +\& binmode($in_file, ":encoding(UTF\-16)"); +\& THEN +\& my $line = <$in_file>; +\& +\& # output file +\& open($out_file, "> :encoding(cp1252)", "wintext"); +\& OR +\& open(my $out_file, ">", "wintext"); +\& binmode($out_file, ":encoding(cp1252)"); +\& THEN +\& print $out_file "some text\en"; +.Ve +.PP +More layers than just the encoding can be specified here. For example, +the incantation \f(CW":raw :encoding(UTF\-16LE) :crlf"\fR includes implicit +CRLF handling. +.SS "℞ 20: Unicode casing" +.IX Subsection "℞ 20: Unicode casing" +Unicode casing is very different from ASCII casing. 
+.PP
+.Vb 2
+\& uc("henry ⅷ") # "HENRY Ⅷ"
+\& uc("tschüß") # "TSCHÜSS" notice ß => SS
+\&
+\& # both are true:
+\& "tschüß" =~ /TSCHÜSS/i # notice ß => SS
+\& "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i # notice Σ,σ,ς sameness
+.Ve
+.SS "℞ 21: Unicode case-insensitive comparisons"
+.IX Subsection "℞ 21: Unicode case-insensitive comparisons"
+Also available in the CPAN Unicode::CaseFold module,
+the new \f(CW\*(C`fc\*(C'\fR “foldcase” function from v5.16 grants
+access to the same Unicode casefolding as the \f(CW\*(C`/i\*(C'\fR
+pattern modifier has always used:
+.PP
+.Vb 1
+\& use feature "fc"; # fc() function is from v5.16
+\&
+\& # sort case\-insensitively
+\& my @sorted = sort { fc($a) cmp fc($b) } @list;
+\&
+\& # both are true:
+\& fc("tschüß") eq fc("TSCHÜSS")
+\& fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
+.Ve
+.SS "℞ 22: Match Unicode linebreak sequence in regex"
+.IX Subsection "℞ 22: Match Unicode linebreak sequence in regex"
+A Unicode linebreak matches the two-character CRLF
+grapheme or any of seven vertical whitespace characters.
+Good for dealing with textfiles coming from different
+operating systems.
+.PP
+.Vb 1
+\& \eR
+\&
+\& s/\eR/\en/g; # normalize all linebreaks to \en
+.Ve
+.SS "℞ 23: Get character category"
+.IX Subsection "℞ 23: Get character category"
+Find the general category of a numeric codepoint.
+.PP
+.Vb 2
+\& use Unicode::UCD qw(charinfo);
+\& my $cat = charinfo(0x3A3)\->{category}; # "Lu"
+.Ve
+.SS "℞ 24: Disabling Unicode-awareness in builtin charclasses"
+.IX Subsection "℞ 24: Disabling Unicode-awareness in builtin charclasses"
+Disable \f(CW\*(C`\ew\*(C'\fR, \f(CW\*(C`\eb\*(C'\fR, \f(CW\*(C`\es\*(C'\fR, \f(CW\*(C`\ed\*(C'\fR, and the POSIX
+classes from working correctly on Unicode either in this
+scope, or in just one regex.
+.PP
+.Vb 2
+\& use v5.14;
+\& use re "/a";
+\&
+\& # OR
+\&
+\& my($num) = $str =~ /(\ed+)/a;
+.Ve
+.PP
+Or use specific un-Unicode properties, like \f(CW\*(C`\ep{ahex}\*(C'\fR
+and \f(CW\*(C`\ep{POSIX_Digit}\*(C'\fR. 
Properties still work normally
+no matter what charset modifiers (\f(CW\*(C`/d /u /l /a /aa\*(C'\fR)
+should be in effect.
+.SS "℞ 25: Match Unicode properties in regex with \ep, \eP"
+.IX Subsection "℞ 25: Match Unicode properties in regex with p, P"
+These all match a single codepoint with the given
+property. Use \f(CW\*(C`\eP\*(C'\fR in place of \f(CW\*(C`\ep\*(C'\fR to match
+one codepoint lacking that property.
+.PP
+.Vb 8
+\& \epL, \epN, \epS, \epP, \epM, \epZ, \epC
+\& \ep{Sk}, \ep{Ps}, \ep{Lt}
+\& \ep{alpha}, \ep{upper}, \ep{lower}
+\& \ep{Latin}, \ep{Greek}
+\& \ep{script_extensions=Latin}, \ep{scx=Greek}
+\& \ep{East_Asian_Width=Wide}, \ep{EA=W}
+\& \ep{Line_Break=Hyphen}, \ep{LB=HY}
+\& \ep{Numeric_Value=4}, \ep{NV=4}
+.Ve
+.SS "℞ 26: Custom character properties"
+.IX Subsection "℞ 26: Custom character properties"
+Define at compile-time your own custom character
+properties for use in regexes.
+.PP
+.Vb 2
+\& # using private\-use characters
+\& sub In_Tengwar { "E000\etE07F\en" }
+\&
+\& if (/\ep{In_Tengwar}/) { ... }
+\&
+\& # blending existing properties
+\& sub Is_GraecoRoman_Title {<<\*(AqEND_OF_SET\*(Aq}
+\& +utf8::IsLatin
+\& +utf8::IsGreek
+\& &utf8::IsTitle
+\& END_OF_SET
+\&
+\& if (/\ep{Is_GraecoRoman_Title}/) { ... }
+.Ve
+.SS "℞ 27: Unicode normalization"
+.IX Subsection "℞ 27: Unicode normalization"
+Typically render into NFD on input and NFC on output. Using NFKC or NFKD
+functions improves recall on searches, assuming you've already done the same
+to the text to be searched. Note that this is about much more than just pre\-
+combined compatibility glyphs; it also reorders marks according to their
+canonical combining classes and weeds out singletons. 
+.PP
+.Vb 5
+\& use Unicode::Normalize;
+\& my $nfd = NFD($orig);
+\& my $nfc = NFC($orig);
+\& my $nfkd = NFKD($orig);
+\& my $nfkc = NFKC($orig);
+.Ve
+.SS "℞ 28: Convert non-ASCII Unicode numerics"
+.IX Subsection "℞ 28: Convert non-ASCII Unicode numerics"
+Unless you’ve used \f(CW\*(C`/a\*(C'\fR or \f(CW\*(C`/aa\*(C'\fR, \f(CW\*(C`\ed\*(C'\fR matches more than
+ASCII digits only, but Perl’s implicit string-to-number
+conversion does not currently recognize these. Here’s how to
+convert such strings manually.
+.PP
+.Vb 8
+\& use v5.14; # needed for num() function
+\& use Unicode::UCD qw(num);
+\& my $str = "got Ⅻ and ४५६७ and ⅞ and here";
+\& my @nums = ();
+\& while ($str =~ /(\ed+|\eN)/g) { # not just ASCII!
+\& push @nums, num($1);
+\& }
+\& say "@nums"; # 12 4567 0.875
+\&
+\& use charnames qw(:full);
+\& my $nv = num("\eN{RUMI DIGIT ONE}\eN{RUMI DIGIT TWO}");
+.Ve
+.SS "℞ 29: Match Unicode grapheme cluster in regex"
+.IX Subsection "℞ 29: Match Unicode grapheme cluster in regex"
+Programmer-visible “characters” are codepoints matched by \f(CW\*(C`/./s\*(C'\fR,
+but user-visible “characters” are graphemes matched by \f(CW\*(C`/\eX/\*(C'\fR.
+.PP
+.Vb 3
+\& # Find vowel *plus* any combining diacritics,underlining,etc. 
+\& my $nfd = NFD($orig); +\& $nfd =~ / (?=[aeiou]) \eX /xi +.Ve +.SS "℞ 30: Extract by grapheme instead of by codepoint (regex)" +.IX Subsection "℞ 30: Extract by grapheme instead of by codepoint (regex)" +.Vb 2 +\& # match and grab five first graphemes +\& my($first_five) = $str =~ /^ ( \eX{5} ) /x; +.Ve +.SS "℞ 31: Extract by grapheme instead of by codepoint (substr)" +.IX Subsection "℞ 31: Extract by grapheme instead of by codepoint (substr)" +.Vb 4 +\& # cpan \-i Unicode::GCString +\& use Unicode::GCString; +\& my $gcs = Unicode::GCString\->new($str); +\& my $first_five = $gcs\->substr(0, 5); +.Ve +.SS "℞ 32: Reverse string by grapheme" +.IX Subsection "℞ 32: Reverse string by grapheme" +Reversing by codepoint messes up diacritics, mistakenly converting +\&\f(CW\*(C`crème brûlée\*(C'\fR into \f(CW\*(C`éel̂urb em̀erc\*(C'\fR instead of into \f(CW\*(C`eélûrb emèrc\*(C'\fR; +so reverse by grapheme instead. Both these approaches work +right no matter what normalization the string is in: +.PP +.Vb 1 +\& $str = join("", reverse $str =~ /\eX/g); +\& +\& # OR: cpan \-i Unicode::GCString +\& use Unicode::GCString; +\& $str = reverse Unicode::GCString\->new($str); +.Ve +.SS "℞ 33: String length in graphemes" +.IX Subsection "℞ 33: String length in graphemes" +The string \f(CW\*(C`brûlée\*(C'\fR has six graphemes but up to eight codepoints. +This counts by grapheme, not by codepoint: +.PP +.Vb 3 +\& my $str = "brûlée"; +\& my $count = 0; +\& while ($str =~ /\eX/g) { $count++ } +\& +\& # OR: cpan \-i Unicode::GCString +\& use Unicode::GCString; +\& my $gcs = Unicode::GCString\->new($str); +\& my $count = $gcs\->length; +.Ve +.SS "℞ 34: Unicode column-width for printing" +.IX Subsection "℞ 34: Unicode column-width for printing" +Perl’s \f(CW\*(C`printf\*(C'\fR, \f(CW\*(C`sprintf\*(C'\fR, and \f(CW\*(C`format\*(C'\fR think all +codepoints take up 1 print column, but many take 0 or 2. 
+Here to show that normalization makes no difference,
+we print out both forms:
+.PP
+.Vb 2
+\& use Unicode::GCString;
+\& use Unicode::Normalize;
+\&
+\& my @words = qw/crème brûlée/;
+\& @words = map { NFC($_), NFD($_) } @words;
+\&
+\& for my $str (@words) {
+\& my $gcs = Unicode::GCString\->new($str);
+\& my $cols = $gcs\->columns;
+\& my $pad = " " x (10 \- $cols);
+\& say $str, $pad, " |";
+\& }
+.Ve
+.PP
+generates this to show that it pads correctly no matter
+the normalization:
+.PP
+.Vb 4
+\& crème |
+\& crème |
+\& brûlée |
+\& brûlée |
+.Ve
+.SS "℞ 35: Unicode collation"
+.IX Subsection "℞ 35: Unicode collation"
+Text sorted by numeric codepoint follows no reasonable alphabetic order;
+use the UCA for sorting text.
+.PP
+.Vb 3
+\& use Unicode::Collate;
+\& my $col = Unicode::Collate\->new();
+\& my @list = $col\->sort(@old_list);
+.Ve
+.PP
+See the \fIucsort\fR program from the Unicode::Tussle CPAN module
+for a convenient command-line interface to this module.
+.SS "℞ 36: Case\- \fIand\fP accent-insensitive Unicode sort"
+.IX Subsection "℞ 36: Case- and accent-insensitive Unicode sort"
+Specify a collation strength of level 1 to ignore case and
+diacritics, only looking at the basic character.
+.PP
+.Vb 3
+\& use Unicode::Collate;
+\& my $col = Unicode::Collate\->new(level => 1);
+\& my @list = $col\->sort(@old_list);
+.Ve
+.SS "℞ 37: Unicode locale collation"
+.IX Subsection "℞ 37: Unicode locale collation"
+Some locales have special sorting rules.
+.PP
+.Vb 4
+\& # either use v5.12, OR: cpan \-i Unicode::Collate::Locale
+\& use Unicode::Collate::Locale;
+\& my $col = Unicode::Collate::Locale\->new(locale => "de_\|_phonebook");
+\& my @list = $col\->sort(@old_list);
+.Ve
+.PP
+The \fIucsort\fR program mentioned above accepts a \f(CW\*(C`\-\-locale\*(C'\fR parameter. 
+.ie n .SS "℞ 38: Making ""cmp"" work on text instead of codepoints" +.el .SS "℞ 38: Making \f(CWcmp\fP work on text instead of codepoints" +.IX Subsection "℞ 38: Making cmp work on text instead of codepoints" +Instead of this: +.PP +.Vb 5 +\& @srecs = sort { +\& $b\->{AGE} <=> $a\->{AGE} +\& || +\& $a\->{NAME} cmp $b\->{NAME} +\& } @recs; +.Ve +.PP +Use this: +.PP +.Vb 9 +\& my $coll = Unicode::Collate\->new(); +\& for my $rec (@recs) { +\& $rec\->{NAME_key} = $coll\->getSortKey( $rec\->{NAME} ); +\& } +\& @srecs = sort { +\& $b\->{AGE} <=> $a\->{AGE} +\& || +\& $a\->{NAME_key} cmp $b\->{NAME_key} +\& } @recs; +.Ve +.SS "℞ 39: Case\- \fIand\fP accent-insensitive comparisons" +.IX Subsection "℞ 39: Case- and accent-insensitive comparisons" +Use a collator object to compare Unicode text by character +instead of by codepoint. +.PP +.Vb 5 +\& use Unicode::Collate; +\& my $es = Unicode::Collate\->new( +\& level => 1, +\& normalization => undef +\& ); +\& +\& # now both are true: +\& $es\->eq("García", "GARCIA" ); +\& $es\->eq("Márquez", "MARQUEZ"); +.Ve +.SS "℞ 40: Case\- \fIand\fP accent-insensitive locale comparisons" +.IX Subsection "℞ 40: Case- and accent-insensitive locale comparisons" +Same, but in a specific locale. +.PP +.Vb 3 +\& my $de = Unicode::Collate::Locale\->new( +\& locale => "de_\|_phonebook", +\& ); +\& +\& # now this is true: +\& $de\->eq("tschüß", "TSCHUESS"); # notice ü => UE, ß => SS +.Ve +.SS "℞ 41: Unicode linebreaking" +.IX Subsection "℞ 41: Unicode linebreaking" +Break up text into lines according to Unicode rules. +.PP +.Vb 3 +\& # cpan \-i Unicode::LineBreak +\& use Unicode::LineBreak; +\& use charnames qw(:full); +\& +\& my $para = "This is a super\eN{HYPHEN}long string. 
" x 20; +\& my $fmt = Unicode::LineBreak\->new; +\& print $fmt\->break($para), "\en"; +.Ve +.SS "℞ 42: Unicode text in DBM hashes, the tedious way" +.IX Subsection "℞ 42: Unicode text in DBM hashes, the tedious way" +Using a regular Perl string as a key or value for a DBM +hash will trigger a wide character exception if any codepoints +won’t fit into a byte. Here’s how to manually manage the translation: +.PP +.Vb 3 +\& use DB_File; +\& use Encode qw(encode decode); +\& tie %dbhash, "DB_File", "pathname"; +\& +\& # STORE +\& +\& # assume $uni_key and $uni_value are abstract Unicode strings +\& my $enc_key = encode("UTF\-8", $uni_key, 1); +\& my $enc_value = encode("UTF\-8", $uni_value, 1); +\& $dbhash{$enc_key} = $enc_value; +\& +\& # FETCH +\& +\& # assume $uni_key holds a normal Perl string (abstract Unicode) +\& my $enc_key = encode("UTF\-8", $uni_key, 1); +\& my $enc_value = $dbhash{$enc_key}; +\& my $uni_value = decode("UTF\-8", $enc_value, 1); +.Ve +.SS "℞ 43: Unicode text in DBM hashes, the easy way" +.IX Subsection "℞ 43: Unicode text in DBM hashes, the easy way" +Here’s how to implicitly manage the translation; all encoding +and decoding is done automatically, just as with streams that +have a particular encoding attached to them: +.PP +.Vb 2 +\& use DB_File; +\& use DBM_Filter; +\& +\& my $dbobj = tie %dbhash, "DB_File", "pathname"; +\& $dbobj\->Filter_Value("utf8"); # this is the magic bit +\& +\& # STORE +\& +\& # assume $uni_key and $uni_value are abstract Unicode strings +\& $dbhash{$uni_key} = $uni_value; +\& +\& # FETCH +\& +\& # $uni_key holds a normal Perl string (abstract Unicode) +\& my $uni_value = $dbhash{$uni_key}; +.Ve +.SS "℞ 44: PROGRAM: Demo of Unicode collation and printing" +.IX Subsection "℞ 44: PROGRAM: Demo of Unicode collation and printing" +Here’s a full program showing how to make use of locale-sensitive +sorting, Unicode casing, and managing print widths when some of the +characters take up zero or two columns, not just one 
column each time. +When run, the following program produces this nicely aligned output: +.PP +.Vb 10 +\& Crème Brûlée....... €2.00 +\& Éclair............. €1.60 +\& Fideuà............. €4.20 +\& Hamburger.......... €6.00 +\& Jamón Serrano...... €4.45 +\& Linguiça........... €7.00 +\& Pâté............... €4.15 +\& Pears.............. €2.00 +\& Pêches............. €2.25 +\& Smørbrød........... €5.75 +\& Spätzle............ €5.50 +\& Xoriço............. €3.00 +\& Γύρος.............. €6.50 +\& 막걸리............. €4.00 +\& おもち............. €2.65 +\& お好み焼き......... €8.00 +\& シュークリーム..... €1.85 +\& 寿司............... €9.99 +\& 包子............... €7.50 +.Ve +.PP +Here's that program. +.PP +.Vb 10 +\& #!/usr/bin/env perl +\& # umenu \- demo sorting and printing of Unicode food +\& # +\& # (obligatory and increasingly long preamble) +\& # +\& use v5.36; +\& use utf8; +\& use warnings qw(FATAL utf8); # fatalize encoding faults +\& use open qw(:std :encoding(UTF\-8)); # undeclared streams in UTF\-8 +\& use charnames qw(:full :short); # unneeded in v5.16 +\& +\& # std modules +\& use Unicode::Normalize; # std perl distro as of v5.8 +\& use List::Util qw(max); # std perl distro as of v5.10 +\& use Unicode::Collate::Locale; # std perl distro as of v5.14 +\& +\& # cpan modules +\& use Unicode::GCString; # from CPAN +\& +\& my %price = ( +\& "γύρος" => 6.50, # gyros +\& "pears" => 2.00, # like um, pears +\& "linguiça" => 7.00, # spicy sausage, Portuguese +\& "xoriço" => 3.00, # chorizo sausage, Catalan +\& "hamburger" => 6.00, # burgermeister meisterburger +\& "éclair" => 1.60, # dessert, French +\& "smørbrød" => 5.75, # sandwiches, Norwegian +\& "spätzle" => 5.50, # Bayerisch noodles, little sparrows +\& "包子" => 7.50, # bao1 zi5, steamed pork buns, Mandarin +\& "jamón serrano" => 4.45, # country ham, Spanish +\& "pêches" => 2.25, # peaches, French +\& "シュークリーム" => 1.85, # cream\-filled pastry like eclair +\& "막걸리" => 4.00, # makgeolli, Korean rice wine +\& "寿司" => 9.99, # sushi, 
Japanese +\& "おもち" => 2.65, # omochi, rice cakes, Japanese +\& "crème brûlée" => 2.00, # crema catalana +\& "fideuà" => 4.20, # more noodles, Valencian +\& # (Catalan=fideuada) +\& "pâté" => 4.15, # gooseliver paste, French +\& "お好み焼き" => 8.00, # okonomiyaki, Japanese +\& ); +\& +\& my $width = 5 + max map { colwidth($_) } keys %price; +\& +\& # So the Asian stuff comes out in an order that someone +\& # who reads those scripts won\*(Aqt freak out over; the +\& # CJK stuff will be in JIS X 0208 order that way. +\& my $coll = Unicode::Collate::Locale\->new(locale => "ja"); +\& +\& for my $item ($coll\->sort(keys %price)) { +\& print pad(entitle($item), $width, "."); +\& printf " €%.2f\en", $price{$item}; +\& } +\& +\& sub pad ($str, $width, $padchar) { +\& return $str . ($padchar x ($width \- colwidth($str))); +\& } +\& +\& sub colwidth ($str) { +\& return Unicode::GCString\->new($str)\->columns; +\& } +\& +\& sub entitle ($str) { +\& $str =~ s{ (?=\epL)(\eS) (\eS*) } +\& { ucfirst($1) . lc($2) }xge; +\& return $str; +\& } +.Ve +.SH "SEE ALSO" +.IX Header "SEE ALSO" +See these manpages, some of which are CPAN modules: +perlunicode, perluniprops, +perlre, perlrecharclass, +perluniintro, perlunitut, perlunifaq, +PerlIO, DB_File, DBM_Filter, DBM_Filter::utf8, +Encode, Encode::Locale, +Unicode::UCD, +Unicode::Normalize, +Unicode::GCString, Unicode::LineBreak, +Unicode::Collate, Unicode::Collate::Locale, +Unicode::Unihan, +Unicode::CaseFold, +Unicode::Tussle, +Lingua::JA::Romanize::Japanese, +Lingua::ZH::Romanize::Pinyin, +Lingua::KO::Romanize::Hangul. 
+.PP +The Unicode::Tussle CPAN module includes many programs +to help with working with Unicode, including +these programs to fully or partly replace standard utilities: +\&\fItcgrep\fR instead of \fIegrep\fR, +\&\fIuniquote\fR instead of \fIcat \-v\fR or \fIhexdump\fR, +\&\fIuniwc\fR instead of \fIwc\fR, +\&\fIunilook\fR instead of \fIlook\fR, +\&\fIunifmt\fR instead of \fIfmt\fR, +and +\&\fIucsort\fR instead of \fIsort\fR. +For exploring Unicode character names and character properties, +see its \fIuniprops\fR, \fIunichars\fR, and \fIuninames\fR programs. +It also supplies these programs, all of which are general filters that do Unicode-y things: +\&\fIunititle\fR and \fIunicaps\fR; +\&\fIuniwide\fR and \fIuninarrow\fR; +\&\fIunisupers\fR and \fIunisubs\fR; +\&\fInfd\fR, \fInfc\fR, \fInfkd\fR, and \fInfkc\fR; +and \fIuc\fR, \fIlc\fR, and \fItc\fR. +.PP +Finally, see the published Unicode Standard (page numbers are from version +6.0.0), including these specific annexes and technical reports: +.IP "§3.13 Default Case Algorithms, page 113; §4.2 Case, pages 120–122; Case Mappings, page 166–172, especially Caseless Matching starting on page 170." 4 +.IX Item "§3.13 Default Case Algorithms, page 113; §4.2 Case, pages 120–122; Case Mappings, page 166–172, especially Caseless Matching starting on page 170." 
+.PD 0 +.IP "UAX #44: Unicode Character Database" 4 +.IX Item "UAX #44: Unicode Character Database" +.IP "UTS #18: Unicode Regular Expressions" 4 +.IX Item "UTS #18: Unicode Regular Expressions" +.IP "UAX #15: Unicode Normalization Forms" 4 +.IX Item "UAX #15: Unicode Normalization Forms" +.IP "UTS #10: Unicode Collation Algorithm" 4 +.IX Item "UTS #10: Unicode Collation Algorithm" +.IP "UAX #29: Unicode Text Segmentation" 4 +.IX Item "UAX #29: Unicode Text Segmentation" +.IP "UAX #14: Unicode Line Breaking Algorithm" 4 +.IX Item "UAX #14: Unicode Line Breaking Algorithm" +.IP "UAX #11: East Asian Width" 4 +.IX Item "UAX #11: East Asian Width" +.PD +.SH AUTHOR +.IX Header "AUTHOR" +Tom Christiansen <tchrist@perl.com> wrote this, with occasional +kibbitzing from Larry Wall and Jeffrey Friedl in the background. +.SH "COPYRIGHT AND LICENCE" +.IX Header "COPYRIGHT AND LICENCE" +Copyright © 2012 Tom Christiansen. +.PP +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. +.PP +Most of these examples taken from the current edition of the “Camel Book”; +that is, from the 4ᵗʰ Edition of \fIProgramming Perl\fR, Copyright © 2012 Tom +Christiansen <et al.>, 2012\-02\-13 by O’Reilly Media. The code itself is +freely redistributable, and you are encouraged to transplant, fold, +spindle, and mutilate any of the examples in this manpage however you please +for inclusion into your own programs without any encumbrance whatsoever. +Acknowledgement via code comment is polite but not required. +.SH "REVISION HISTORY" +.IX Header "REVISION HISTORY" +v1.0.0 – first public release, 2012\-02\-27 |