diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 19:43:11 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 19:43:11 +0000 |
commit | fc22b3d6507c6745911b9dfcc68f1e665ae13dbc (patch) | |
tree | ce1e3bce06471410239a6f41282e328770aa404a /upstream/archlinux/man3/Unicode::UCD.3perl | |
parent | Initial commit. (diff) | |
download | manpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.tar.xz manpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.zip |
Adding upstream version 4.22.0.upstream/4.22.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'upstream/archlinux/man3/Unicode::UCD.3perl')
-rw-r--r-- | upstream/archlinux/man3/Unicode::UCD.3perl | 1894 |
1 files changed, 1894 insertions, 0 deletions
diff --git a/upstream/archlinux/man3/Unicode::UCD.3perl b/upstream/archlinux/man3/Unicode::UCD.3perl new file mode 100644 index 00000000..fa0484c7 --- /dev/null +++ b/upstream/archlinux/man3/Unicode::UCD.3perl @@ -0,0 +1,1894 @@ +.\" -*- mode: troff; coding: utf-8 -*- +.\" Automatically generated by Pod::Man 5.01 (Pod::Simple 3.43) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" \*(C` and \*(C' are quotes in nroff, nothing in troff, for use with C<>. +.ie n \{\ +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds C` +. ds C' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is >0, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.\" +.\" Avoid warning from groff about undefined register 'F'. +.de IX +.. +.nr rF 0 +.if \n(.g .if rF .nr rF 1 +.if (\n(rF:(\n(.g==0)) \{\ +. if \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. if !\nF==2 \{\ +. nr % 0 +. nr F 2 +. \} +. \} +.\} +.rr rF +.\" ======================================================================== +.\" +.IX Title "Unicode::UCD 3perl" +.TH Unicode::UCD 3perl 2024-02-11 "perl v5.38.2" "Perl Programmers Reference Guide" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. 
+.if n .ad l +.nh +.SH NAME +Unicode::UCD \- Unicode character database +.SH SYNOPSIS +.IX Header "SYNOPSIS" +.Vb 2 +\& use Unicode::UCD \*(Aqcharinfo\*(Aq; +\& my $charinfo = charinfo($codepoint); +\& +\& use Unicode::UCD \*(Aqcharprop\*(Aq; +\& my $value = charprop($codepoint, $property); +\& +\& use Unicode::UCD \*(Aqcharprops_all\*(Aq; +\& my $all_values_hash_ref = charprops_all($codepoint); +\& +\& use Unicode::UCD \*(Aqcasefold\*(Aq; +\& my $casefold = casefold($codepoint); +\& +\& use Unicode::UCD \*(Aqall_casefolds\*(Aq; +\& my $all_casefolds_ref = all_casefolds(); +\& +\& use Unicode::UCD \*(Aqcasespec\*(Aq; +\& my $casespec = casespec($codepoint); +\& +\& use Unicode::UCD \*(Aqcharblock\*(Aq; +\& my $charblock = charblock($codepoint); +\& +\& use Unicode::UCD \*(Aqcharscript\*(Aq; +\& my $charscript = charscript($codepoint); +\& +\& use Unicode::UCD \*(Aqcharblocks\*(Aq; +\& my $charblocks = charblocks(); +\& +\& use Unicode::UCD \*(Aqcharscripts\*(Aq; +\& my $charscripts = charscripts(); +\& +\& use Unicode::UCD qw(charscript charinrange); +\& my $range = charscript($script); +\& print "looks like $script\en" if charinrange($range, $codepoint); +\& +\& use Unicode::UCD qw(general_categories bidi_types); +\& my $categories = general_categories(); +\& my $types = bidi_types(); +\& +\& use Unicode::UCD \*(Aqprop_aliases\*(Aq; +\& my @space_names = prop_aliases("space"); +\& +\& use Unicode::UCD \*(Aqprop_value_aliases\*(Aq; +\& my @gc_punct_names = prop_value_aliases("Gc", "Punct"); +\& +\& use Unicode::UCD \*(Aqprop_values\*(Aq; +\& my @all_EA_short_names = prop_values("East_Asian_Width"); +\& +\& use Unicode::UCD \*(Aqprop_invlist\*(Aq; +\& my @puncts = prop_invlist("gc=punctuation"); +\& +\& use Unicode::UCD \*(Aqprop_invmap\*(Aq; +\& my ($list_ref, $map_ref, $format, $missing) +\& = prop_invmap("General Category"); +\& +\& use Unicode::UCD \*(Aqsearch_invlist\*(Aq; +\& my $index = search_invlist(\e@invlist, $code_point); +\& +\& # The following function 
should be used only internally in +\& # implementations of the Unicode Normalization Algorithm, and there +\& # are better choices than it. +\& use Unicode::UCD \*(Aqcompexcl\*(Aq; +\& my $compexcl = compexcl($codepoint); +\& +\& use Unicode::UCD \*(Aqnamedseq\*(Aq; +\& my $namedseq = namedseq($named_sequence_name); +\& +\& my $unicode_version = Unicode::UCD::UnicodeVersion(); +\& +\& my $convert_to_numeric = +\& Unicode::UCD::num("\eN{RUMI DIGIT ONE}\eN{RUMI DIGIT TWO}"); +.Ve +.SH DESCRIPTION +.IX Header "DESCRIPTION" +The Unicode::UCD module offers a series of functions that +provide a simple interface to the Unicode +Character Database. +.SS "code point argument" +.IX Subsection "code point argument" +Some of the functions are called with a \fIcode point argument\fR, which is either +a decimal or a hexadecimal scalar designating a code point in the platform's +native character set (extended to Unicode), or a string containing \f(CW\*(C`U+\*(C'\fR +followed by hexadecimals +designating a Unicode code point. A leading 0 will force a hexadecimal +interpretation, as will a hexadecimal digit that isn't a decimal digit. +.PP +Examples: +.PP +.Vb 6 +\& 223 # Decimal 223 in native character set +\& 0223 # Hexadecimal 223, native (= 547 decimal) +\& 0xDF # Hexadecimal DF, native (= 223 decimal) +\& \*(Aq0xDF\*(Aq # String form of hexadecimal (= 223 decimal) +\& \*(AqU+DF\*(Aq # Hexadecimal DF, in Unicode\*(Aqs character set +\& (= LATIN SMALL LETTER SHARP S) +.Ve +.PP +Note that the largest code point in Unicode is U+10FFFF. +.SS \fBcharinfo()\fP +.IX Subsection "charinfo()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharinfo\*(Aq; +\& +\& my $charinfo = charinfo(0x41); +.Ve +.PP +This returns information about the input "code point argument" +as a reference to a hash of fields as defined by the Unicode +standard. 
If the "code point argument" is not assigned in the standard +(i.e., has the general category \f(CW\*(C`Cn\*(C'\fR meaning \f(CW\*(C`Unassigned\*(C'\fR) +or is a non-character (meaning it is guaranteed to never be assigned in +the standard), +\&\f(CW\*(C`undef\*(C'\fR is returned. +.PP +Fields that aren't applicable to the particular code point argument exist in the +returned hash, and are empty. +.PP +For results that are less "raw" than this function returns, or to get the values for +any property, not just the few covered by this function, use the +"\fBcharprop()\fR" function. +.PP +The keys in the hash with the meanings of their values are: +.IP \fBcode\fR 4 +.IX Item "code" +the input native "code point argument" expressed in hexadecimal, with +leading zeros +added if necessary to make it contain at least four hexdigits +.IP \fBname\fR 4 +.IX Item "name" +name of \fIcode\fR, all IN UPPER CASE. +Some control-type code points do not have names. +This field will be empty for \f(CW\*(C`Surrogate\*(C'\fR and \f(CW\*(C`Private Use\*(C'\fR code points, +and for the others without a name, +it will contain a description enclosed in angle brackets, like +\&\f(CW\*(C`<control>\*(C'\fR. +.IP \fBcategory\fR 4 +.IX Item "category" +The short name of the general category of \fIcode\fR. +This will match one of the keys in the hash returned by "\fBgeneral_categories()\fR". +.Sp +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the category name. +.IP \fBcombining\fR 4 +.IX Item "combining" +the combining class number for \fIcode\fR used in the Canonical Ordering Algorithm. +For Unicode 5.1, this is described in Section 3.11 \f(CW\*(C`Canonical Ordering Behavior\*(C'\fR +available at +<http://www.unicode.org/versions/Unicode5.1.0/> +.Sp +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the combining class number. +.IP \fBbidi\fR 4 +.IX Item "bidi" +bidirectional type of \fIcode\fR. 
+This will match one of the keys in the hash returned by "\fBbidi_types()\fR". +.Sp +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the bidi type name. +.IP \fBdecomposition\fR 4 +.IX Item "decomposition" +is empty if \fIcode\fR has no decomposition; or is one or more codes +(separated by spaces) that, taken in order, represent a decomposition for +\&\fIcode\fR. Each has at least four hexdigits. +The codes may be preceded by a word enclosed in angle brackets, then a space, +like \f(CW\*(C`<compat> \*(C'\fR, giving the type of decomposition +.Sp +This decomposition may be an intermediate one whose components are also +decomposable. Use Unicode::Normalize to get the final decomposition in one +step. +.IP \fBdecimal\fR 4 +.IX Item "decimal" +if \fIcode\fR represents a decimal digit this is its integer numeric value +.IP \fBdigit\fR 4 +.IX Item "digit" +if \fIcode\fR represents some other digit-like number, this is its integer +numeric value +.IP \fBnumeric\fR 4 +.IX Item "numeric" +if \fIcode\fR represents a whole or rational number, this is its numeric value. +Rational values are expressed as a string like \f(CW\*(C`1/4\*(C'\fR. +.IP \fBmirrored\fR 4 +.IX Item "mirrored" +\&\f(CW\*(C`Y\*(C'\fR or \f(CW\*(C`N\*(C'\fR designating if \fIcode\fR is mirrored in bidirectional text +.IP \fBunicode10\fR 4 +.IX Item "unicode10" +name of \fIcode\fR in the Unicode 1.0 standard if one +existed for this code point and is different from the current name +.IP \fBcomment\fR 4 +.IX Item "comment" +As of Unicode 6.0, this is always empty. +.IP \fBupper\fR 4 +.IX Item "upper" +is, if non-empty, the uppercase mapping for \fIcode\fR expressed as at least four +hexdigits. This indicates that the full uppercase mapping is a single +character, and is identical to the simple (single-character only) mapping. 
+When this field is empty, it means that the simple uppercase mapping is +\&\fIcode\fR itself; you'll need some other means (like "\fBcharprop()\fR" or +"\fBcasespec()\fR") to get the full mapping. +.IP \fBlower\fR 4 +.IX Item "lower" +is, if non-empty, the lowercase mapping for \fIcode\fR expressed as at least four +hexdigits. This indicates that the full lowercase mapping is a single +character, and is identical to the simple (single-character only) mapping. +When this field is empty, it means that the simple lowercase mapping is +\&\fIcode\fR itself; you'll need some other means (like "\fBcharprop()\fR" or +"\fBcasespec()\fR") to get the full mapping. +.IP \fBtitle\fR 4 +.IX Item "title" +is, if non-empty, the titlecase mapping for \fIcode\fR expressed as at least four +hexdigits. This indicates that the full titlecase mapping is a single +character, and is identical to the simple (single-character only) mapping. +When this field is empty, it means that the simple titlecase mapping is +\&\fIcode\fR itself; you'll need some other means (like "\fBcharprop()\fR" or +"\fBcasespec()\fR") to get the full mapping. +.IP \fBblock\fR 4 +.IX Item "block" +the block \fIcode\fR belongs to (used in \f(CW\*(C`\ep{Blk=...}\*(C'\fR). +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the block name. +.Sp +See "Blocks versus Scripts". +.IP \fBscript\fR 4 +.IX Item "script" +the script \fIcode\fR belongs to. +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the script name. Note that this is the older "Script" property value, and +not the improved "Script_Extensions" value. +.Sp +See "Blocks versus Scripts". +.PP +Note that you cannot do (de)composition and casing based solely on the +\&\fIdecomposition\fR, \fIcombining\fR, \fIlower\fR, \fIupper\fR, and \fItitle\fR fields; you +will also need the "\fBcasespec()\fR" function and the \f(CW\*(C`Composition_Exclusion\*(C'\fR +property.
(Or you could just use the \fBlc()\fR, +\&\fBuc()\fR, and \fBucfirst()\fR functions, and the +Unicode::Normalize module.) +.SS \fBcharprop()\fP +.IX Subsection "charprop()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharprop\*(Aq; +\& +\& print charprop(0x41, "Gc"), "\en"; +\& print charprop(0x61, "General_Category"), "\en"; +\& +\& prints +\& Lu +\& Ll +.Ve +.PP +This returns the value of the Unicode property given by the second parameter +for the "code point argument" given by the first. +.PP +The passed-in property may be specified as any of the synonyms returned by +"\fBprop_aliases()\fR". +.PP +The return value is always a scalar, either a string or a number. For +properties where there are synonyms for the values, the synonym returned by +this function is the longest, most descriptive form, the one returned by +"\fBprop_value_aliases()\fR" when called in a scalar context. Of course, you can +call "\fBprop_value_aliases()\fR" on the result to get other synonyms. +.PP +The return values are more "cooked" than the "\fBcharinfo()\fR" ones. For +example, the \f(CW"uc"\fR property value is the actual string containing the full +uppercase mapping of the input code point. You have to go to extra trouble +with \f(CW\*(C`charinfo\*(C'\fR to get this value from its \f(CW\*(C`upper\*(C'\fR hash element when the +full mapping differs from the simple one. +.PP +Special note should be made of the return values for a few properties: +.IP Block 4 +.IX Item "Block" +The value returned is the new-style (see "Old-style versus new-style block +names"). +.IP Decomposition_Mapping 4 +.IX Item "Decomposition_Mapping" +Like "\fBcharinfo()\fR", the result may be an intermediate decomposition whose +components are also decomposable. Use Unicode::Normalize to get the final +decomposition in one step. +.Sp +Unlike "\fBcharinfo()\fR", this does not include the decomposition type. Use the +\&\f(CW\*(C`Decomposition_Type\*(C'\fR property to get that. 
+.IP Name_Alias 4 +.IX Item "Name_Alias" +If the input code point's name has more than one synonym, they are returned +joined into a single comma-separated string. +.IP Numeric_Value 4 +.IX Item "Numeric_Value" +If the result is a fraction, it is converted into a floating point number to +the accuracy of your platform. +.IP Script_Extensions 4 +.IX Item "Script_Extensions" +If the result is multiple script names, they are returned joined into a single +comma-separated string. +.PP +When called with a property that is a Perl extension that isn't expressible in +a compound form, this function currently returns \f(CW\*(C`undef\*(C'\fR, as the only two +possible values are \fItrue\fR or \fIfalse\fR (1 or 0 I suppose). This behavior may +change in the future, so don't write code that relies on it. \f(CW\*(C`Present_In\*(C'\fR is +a Perl extension that is expressible in a bipartite or compound form (for +example, \f(CW\*(C`\ep{Present_In=4.0}\*(C'\fR), so \f(CW\*(C`charprop\*(C'\fR accepts it. But \f(CW\*(C`Any\*(C'\fR is a +Perl extension that isn't expressible that way, so \f(CW\*(C`charprop\*(C'\fR returns +\&\f(CW\*(C`undef\*(C'\fR for it. Also \f(CW\*(C`charprop\*(C'\fR returns \f(CW\*(C`undef\*(C'\fR for all Perl extensions +that are internal-only. +.SS \fBcharprops_all()\fP +.IX Subsection "charprops_all()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharprops_all\*(Aq; +\& +\& my $properties_of_A_hash_ref = charprops_all("U+41"); +.Ve +.PP +This returns a reference to a hash whose keys are all the distinct Unicode (no +Perl extension) properties, and whose values are the respective values for +those properties for the input "code point argument". +.PP +Each key is the property name in its longest, most descriptive form. The +values are what "\fBcharprop()\fR" would return. +.PP +This function is expensive in time and memory.
+.SS \fBcharblock()\fP +.IX Subsection "charblock()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharblock\*(Aq; +\& +\& my $charblock = charblock(0x41); +\& my $charblock = charblock(1234); +\& my $charblock = charblock(0x263a); +\& my $charblock = charblock("U+263a"); +\& +\& my $range = charblock(\*(AqArmenian\*(Aq); +.Ve +.PP +With a "code point argument", \f(CWcharblock()\fR returns the \fIblock\fR the code point +belongs to, e.g. \f(CW\*(C`Basic Latin\*(C'\fR. The old-style block name is returned (see +"Old-style versus new-style block names"). +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the block name. +.PP +If the code point is unassigned, this returns the block it would belong to if +it were assigned. (If the Unicode version being used is so early as to not +have blocks, all code points are considered to be in \f(CW\*(C`No_Block\*(C'\fR.) +.PP +See also "Blocks versus Scripts". +.PP +If supplied with an argument that can't be a code point, \f(CWcharblock()\fR tries to +do the opposite and interpret the argument as an old-style block name. On an +ASCII platform, the return value is a \fIrange set\fR with one range: an +anonymous array with a single element that consists of another anonymous array +whose first element is the first code point in the block, and whose second +element is the final code point in the block. On an EBCDIC +platform, the first two Unicode blocks are not contiguous. Their range sets +are lists containing \fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You +can test whether a code point is in a range set using the "\fBcharinrange()\fR" +function. (To be precise, each \fIrange set\fR contains a third array element, +after the range boundary ones: the old-style block name.) +.PP +If the argument to \f(CWcharblock()\fR is not a known block, \f(CW\*(C`undef\*(C'\fR is +returned.
+.SS \fBcharscript()\fP +.IX Subsection "charscript()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharscript\*(Aq; +\& +\& my $charscript = charscript(0x41); +\& my $charscript = charscript(1234); +\& my $charscript = charscript("U+263a"); +\& +\& my $range = charscript(\*(AqThai\*(Aq); +.Ve +.PP +With a "code point argument", \f(CWcharscript()\fR returns the \fIscript\fR the +code point belongs to, e.g., \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR. +If the code point is unassigned or the Unicode version being used is so early +that it doesn't have scripts, this function returns \f(CW"Unknown"\fR. +The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms +of the script name. +.PP +Note that the Script_Extensions property is an improved version of the Script +property, and you should probably be using that instead, with the +"\fBcharprop()\fR" function. +.PP +If supplied with an argument that can't be a code point, \fBcharscript()\fR tries +to do the opposite and interpret the argument as a script name. The +return value is a \fIrange set\fR: an anonymous array of arrays that contain +\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a +code point is in a range set using the "\fBcharinrange()\fR" function. +(To be precise, each \fIrange set\fR contains a third array element, +after the range boundary ones: the script name.) +.PP +If the \f(CWcharscript()\fR argument is not a known script, \f(CW\*(C`undef\*(C'\fR is returned. +.PP +See also "Blocks versus Scripts". +.SS \fBcharblocks()\fP +.IX Subsection "charblocks()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharblocks\*(Aq; +\& +\& my $charblocks = charblocks(); +.Ve +.PP +\&\f(CWcharblocks()\fR returns a reference to a hash with the known block names +as the keys, and the code point ranges (see "\fBcharblock()\fR") as the values. +.PP +The names are in the old-style (see "Old-style versus new-style block +names"). 
+.PP +prop_invmap("block") can be used to get this same data in a +different type of data structure. +.PP +prop_values("Block") can be used to get all +the known new-style block names as a list, without the code point ranges. +.PP +See also "Blocks versus Scripts". +.SS \fBcharscripts()\fP +.IX Subsection "charscripts()" +.Vb 1 +\& use Unicode::UCD \*(Aqcharscripts\*(Aq; +\& +\& my $charscripts = charscripts(); +.Ve +.PP +\&\f(CWcharscripts()\fR returns a reference to a hash with the known script +names as the keys, and the code point ranges (see "\fBcharscript()\fR") as +the values. +.PP +prop_invmap("script") can be used to get this same data in a +different type of data structure. Since the Script_Extensions property is an +improved version of the Script property, you should instead use +prop_invmap("scx"). +.PP +\&\f(CWprop_values("Script")\fR can be used to get all +the known script names as a list, without the code point ranges. +.PP +See also "Blocks versus Scripts". +.SS \fBcharinrange()\fP +.IX Subsection "charinrange()" +In addition to using the \f(CW\*(C`\ep{Blk=...}\*(C'\fR and \f(CW\*(C`\eP{Blk=...}\*(C'\fR constructs, you +can also test whether a code point is in the \fIrange\fR as returned by +"\fBcharblock()\fR" and "\fBcharscript()\fR" or as the values of the hash returned +by "\fBcharblocks()\fR" and "\fBcharscripts()\fR" by using \f(CWcharinrange()\fR: +.PP +.Vb 1 +\& use Unicode::UCD qw(charscript charinrange); +\& +\& $range = charscript(\*(AqHiragana\*(Aq); +\& print "looks like hiragana\en" if charinrange($range, $codepoint); +.Ve +.SS \fBgeneral_categories()\fP +.IX Subsection "general_categories()" +.Vb 1 +\& use Unicode::UCD \*(Aqgeneral_categories\*(Aq; +\& +\& my $categories = general_categories(); +.Ve +.PP +This returns a reference to a hash which has short +general category names (such as \f(CW\*(C`Lu\*(C'\fR, \f(CW\*(C`Nd\*(C'\fR, \f(CW\*(C`Zs\*(C'\fR, \f(CW\*(C`S\*(C'\fR) as keys and long +names (such as 
\f(CW\*(C`UppercaseLetter\*(C'\fR, \f(CW\*(C`DecimalNumber\*(C'\fR, \f(CW\*(C`SpaceSeparator\*(C'\fR, +\&\f(CW\*(C`Symbol\*(C'\fR) as values. The hash is reversible in case you need to go +from the long names to the short names. The general category is the +one returned from +"\fBcharinfo()\fR" under the \f(CW\*(C`category\*(C'\fR key. +.PP +The "\fBprop_values()\fR" and "\fBprop_value_aliases()\fR" functions can be used as an +alternative to this function; the first returning a simple list of the short +category names; and the second gets all the synonyms of a given category name. +.SS \fBbidi_types()\fP +.IX Subsection "bidi_types()" +.Vb 1 +\& use Unicode::UCD \*(Aqbidi_types\*(Aq; +\& +\& my $categories = bidi_types(); +.Ve +.PP +This returns a reference to a hash which has the short +bidi (bidirectional) type names (such as \f(CW\*(C`L\*(C'\fR, \f(CW\*(C`R\*(C'\fR) as keys and long +names (such as \f(CW\*(C`Left\-to\-Right\*(C'\fR, \f(CW\*(C`Right\-to\-Left\*(C'\fR) as values. The +hash is reversible in case you need to go from the long names to the +short names. The bidi type is the one returned from +"\fBcharinfo()\fR" +under the \f(CW\*(C`bidi\*(C'\fR key. For the exact meaning of the various bidi classes +the Unicode TR9 is recommended reading: +<http://www.unicode.org/reports/tr9/> +(as of Unicode 5.0.0) +.PP +The "\fBprop_values()\fR" and "\fBprop_value_aliases()\fR" functions can be used as an +alternative to this function; the first returning a simple list of the short +bidi type names; and the second gets all the synonyms of a given bidi type +name. +.SS \fBcompexcl()\fP +.IX Subsection "compexcl()" +WARNING: Unicode discourages the use of this function or any of the +alternative mechanisms listed in this section (the documentation of +\&\f(CWcompexcl()\fR), except internally in implementations of the Unicode +Normalization Algorithm. You should be using Unicode::Normalize directly +instead of these. Using these will likely lead to half-baked results. 
+.PP +.Vb 1 +\& use Unicode::UCD \*(Aqcompexcl\*(Aq; +\& +\& my $compexcl = compexcl(0x09dc); +.Ve +.PP +This routine returns \f(CW\*(C`undef\*(C'\fR if the Unicode version being used is so early +that it doesn't have this property. +.PP +\&\f(CWcompexcl()\fR is included for backwards +compatibility, but as of Perl 5.12 and more modern Unicode versions, for +most purposes it is probably more convenient to use one of the following +instead: +.PP +.Vb 2 +\& my $compexcl = chr(0x09dc) =~ /\ep{Comp_Ex}/; +\& my $compexcl = chr(0x09dc) =~ /\ep{Full_Composition_Exclusion}/; +.Ve +.PP +or even +.PP +.Vb 2 +\& my $compexcl = chr(0x09dc) =~ /\ep{CE}/; +\& my $compexcl = chr(0x09dc) =~ /\ep{Composition_Exclusion}/; +.Ve +.PP +The first two forms return \fBtrue\fR if the "code point argument" should not +be produced by composition normalization. For the final two forms to return +\&\fBtrue\fR, it is additionally required that this fact not otherwise be +determinable from the Unicode data base. +.PP +This routine behaves identically to the final two forms. That is, +it does not return \fBtrue\fR if the code point has a decomposition +consisting of another single code point, nor if its decomposition starts +with a code point whose combining class is non-zero. Code points that meet +either of these conditions should also not be produced by composition +normalization, which is probably why you should use the +\&\f(CW\*(C`Full_Composition_Exclusion\*(C'\fR property instead, as shown above. +.PP +The routine returns \fBfalse\fR otherwise. +.SS \fBcasefold()\fP +.IX Subsection "casefold()" +.Vb 1 +\& use Unicode::UCD \*(Aqcasefold\*(Aq; +\& +\& my $casefold = casefold(0xDF); +\& if (defined $casefold) { +\& my @full_fold_hex = split / /, $casefold\->{\*(Aqfull\*(Aq}; +\& my $full_fold_string = +\& join "", map {chr(hex($_))} @full_fold_hex; +\& my @turkic_fold_hex = +\& split / /, ($casefold\->{\*(Aqturkic\*(Aq} ne "") +\& ?
$casefold\->{\*(Aqturkic\*(Aq} +\& : $casefold\->{\*(Aqfull\*(Aq}; +\& my $turkic_fold_string = +\& join "", map {chr(hex($_))} @turkic_fold_hex; +\& } +\& if (defined $casefold && $casefold\->{\*(Aqsimple\*(Aq} ne "") { +\& my $simple_fold_hex = $casefold\->{\*(Aqsimple\*(Aq}; +\& my $simple_fold_string = chr(hex($simple_fold_hex)); +\& } +.Ve +.PP +This returns the (almost) locale-independent case folding of the +character specified by the "code point argument". (Starting in Perl v5.16, +the core function \f(CWfc()\fR returns the \f(CW\*(C`full\*(C'\fR mapping (described below) +faster than this does, and for entire strings.) +.PP +If there is no case folding for the input code point, \f(CW\*(C`undef\*(C'\fR is returned. +.PP +If there is a case folding for that code point, a reference to a hash +with the following fields is returned: +.IP \fBcode\fR 4 +.IX Item "code" +the input native "code point argument" expressed in hexadecimal, with +leading zeros +added if necessary to make it contain at least four hexdigits +.IP \fBfull\fR 4 +.IX Item "full" +one or more codes (separated by spaces) that, taken in order, give the +code points for the case folding for \fIcode\fR. +Each has at least four hexdigits. +.IP \fBsimple\fR 4 +.IX Item "simple" +is empty, or is exactly one code with at least four hexdigits which can be used +as an alternative case folding when the calling program cannot cope with the +fold being a sequence of multiple code points. If \fIfull\fR is just one code +point, then \fIsimple\fR equals \fIfull\fR. If there is no single code point folding +defined for \fIcode\fR, then \fIsimple\fR is the empty string. Otherwise, it is an +inferior, but still better-than-nothing alternative folding to \fIfull\fR. +.IP \fBmapping\fR 4 +.IX Item "mapping" +is the same as \fIsimple\fR if \fIsimple\fR is not empty, and it is the same as \fIfull\fR +otherwise. It can be considered to be the simplest possible folding for +\&\fIcode\fR. 
It is defined primarily for backwards compatibility. +.IP \fBstatus\fR 4 +.IX Item "status" +is \f(CW\*(C`C\*(C'\fR (for \f(CW\*(C`common\*(C'\fR) if the best possible fold is a single code point +(\fIsimple\fR equals \fIfull\fR equals \fImapping\fR). It is \f(CW\*(C`S\*(C'\fR if there are distinct +folds, \fIsimple\fR and \fIfull\fR (\fImapping\fR equals \fIsimple\fR). And it is \f(CW\*(C`F\*(C'\fR if +there is only a \fIfull\fR fold (\fImapping\fR equals \fIfull\fR; \fIsimple\fR is empty). +Note that this +describes the contents of \fImapping\fR. It is defined primarily for backwards +compatibility. +.Sp +For Unicode versions between 3.1 and 3.1.1 inclusive, \fIstatus\fR can also be +\&\f(CW\*(C`I\*(C'\fR which is the same as \f(CW\*(C`C\*(C'\fR but is a special case for dotted uppercase I and +dotless lowercase i: +.RS 4 +.ie n .IP "\fB*\fR If you use this ""I"" mapping" 4 +.el .IP "\fB*\fR If you use this \f(CWI\fR mapping" 4 +.IX Item "* If you use this I mapping" +the result is case-insensitive, +but dotless and dotted I's are not distinguished +.ie n .IP "\fB*\fR If you exclude this ""I"" mapping" 4 +.el .IP "\fB*\fR If you exclude this \f(CWI\fR mapping" 4 +.IX Item "* If you exclude this I mapping" +the result is not fully case-insensitive, but +dotless and dotted I's are distinguished +.RE +.RS 4 +.RE +.IP \fBturkic\fR 4 +.IX Item "turkic" +contains any special folding for Turkic languages. For versions of Unicode +starting with 3.2, this field is empty unless \fIcode\fR has a different folding +in Turkic languages, in which case it is one or more codes (separated by +spaces) that, taken in order, give the code points for the case folding for +\&\fIcode\fR in those languages. +Each code has at least four hexdigits. +Note that this folding does not maintain canonical equivalence without +additional processing. 
+.Sp +For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless +there is a +special folding for Turkic languages, in which case \fIstatus\fR is \f(CW\*(C`I\*(C'\fR, and +\&\fImapping\fR, \fIfull\fR, \fIsimple\fR, and \fIturkic\fR are all equal. +.PP +Programs that want complete generality and the best folding results should use +the folding contained in the \fIfull\fR field. But note that the fold for some +code points will be a sequence of multiple code points. +.PP +Programs that can't cope with the fold mapping being multiple code points can +use the folding contained in the \fIsimple\fR field, with the loss of some +generality. In Unicode 5.1, about 7% of the defined foldings have no single +code point folding. +.PP +The \fImapping\fR and \fIstatus\fR fields are provided for backwards compatibility for +existing programs. They contain the same values as in previous versions of +this function. +.PP +Locale is not completely independent. The \fIturkic\fR field contains results to +use when the locale is a Turkic language. +.PP +For more information about case mappings see +<http://www.unicode.org/reports/tr21> +.SS \fBall_casefolds()\fP +.IX Subsection "all_casefolds()" +.Vb 1 +\& use Unicode::UCD \*(Aqall_casefolds\*(Aq; +\& +\& my $all_folds_ref = all_casefolds(); +\& foreach my $char_with_casefold (sort { $a <=> $b } +\& keys %$all_folds_ref) +\& { +\& printf "%04X:", $char_with_casefold; +\& my $casefold = $all_folds_ref\->{$char_with_casefold}; +\& +\& # Get folds for $char_with_casefold +\& +\& my @full_fold_hex = split / /, $casefold\->{\*(Aqfull\*(Aq}; +\& my $full_fold_string = +\& join "", map {chr(hex($_))} @full_fold_hex; +\& print " full=", join " ", @full_fold_hex; +\& my @turkic_fold_hex = +\& split / /, ($casefold\->{\*(Aqturkic\*(Aq} ne "") +\& ? 
$casefold\->{\*(Aqturkic\*(Aq} +\& : $casefold\->{\*(Aqfull\*(Aq}; +\& my $turkic_fold_string = +\& join "", map {chr(hex($_))} @turkic_fold_hex; +\& print "; turkic=", join " ", @turkic_fold_hex; +\& if (defined $casefold && $casefold\->{\*(Aqsimple\*(Aq} ne "") { +\& my $simple_fold_hex = $casefold\->{\*(Aqsimple\*(Aq}; +\& my $simple_fold_string = chr(hex($simple_fold_hex)); +\& print "; simple=$simple_fold_hex"; +\& } +\& print "\en"; +\& } +.Ve +.PP +This returns all the case foldings in the current version of Unicode in the +form of a reference to a hash. Each key to the hash is the decimal +representation of a Unicode character that has a casefold other than +itself. The casefold of a semi-colon is itself, so it isn't in the hash; +likewise for a lowercase "a", but there is an entry for a capital "A". The +hash value for each key is another hash, identical to what is returned by +"\fBcasefold()\fR" if called with that code point as its argument. So the value +\&\f(CW\*(C`all_casefolds()\->{ord("A")}\*(C'\fR is equivalent to \f(CW\*(C`casefold(ord("A"))\*(C'\fR. +.SS \fBcasespec()\fP +.IX Subsection "casespec()" +.Vb 1 +\& use Unicode::UCD \*(Aqcasespec\*(Aq; +\& +\& my $casespec = casespec(0xFB00); +.Ve +.PP +This returns the potentially locale-dependent case mappings of the "code point +argument". The mappings may be longer than a single code point (which the basic +Unicode case mappings as returned by "\fBcharinfo()\fR" never are). +.PP +If there are no case mappings for the "code point argument", or if all three +possible mappings (\fIlower\fR, \fItitle\fR and \fIupper\fR) result in single code +points and are locale independent and unconditional, \f(CW\*(C`undef\*(C'\fR is returned +(which means that the case mappings, if any, for the code point are those +returned by "\fBcharinfo()\fR").
+.PP +Otherwise, a reference to a hash giving the mappings (or a reference to a hash +of such hashes, explained below) is returned with the following keys and their +meanings: +.PP +The keys in the bottom layer hash with the meanings of their values are: +.IP \fBcode\fR 4 +.IX Item "code" +the input native "code point argument" expressed in hexadecimal, with +leading zeros +added if necessary to make it contain at least four hexdigits +.IP \fBlower\fR 4 +.IX Item "lower" +one or more codes (separated by spaces) that, taken in order, give the +code points for the lower case of \fIcode\fR. +Each has at least four hexdigits. +.IP \fBtitle\fR 4 +.IX Item "title" +one or more codes (separated by spaces) that, taken in order, give the +code points for the title case of \fIcode\fR. +Each has at least four hexdigits. +.IP \fBupper\fR 4 +.IX Item "upper" +one or more codes (separated by spaces) that, taken in order, give the +code points for the upper case of \fIcode\fR. +Each has at least four hexdigits. +.IP \fBcondition\fR 4 +.IX Item "condition" +the conditions for the mappings to be valid. +If \f(CW\*(C`undef\*(C'\fR, the mappings are always valid. +When defined, this field is a list of conditions, +all of which must be true for the mappings to be valid. +The list consists of one or more +\&\fIlocales\fR (see below) +and/or \fIcontexts\fR (explained in the next paragraph), +separated by spaces. +(Other than as used to separate elements, spaces are to be ignored.) +Case distinctions in the condition list are not significant. +Conditions preceded by "NON_" represent the negation of the condition. +.Sp +A \fIcontext\fR is one of those defined in the Unicode standard. +For Unicode 5.1, they are defined in Section 3.13 \f(CW\*(C`Default Case Operations\*(C'\fR +available at +<http://www.unicode.org/versions/Unicode5.1.0/>. +These are for context-sensitive casing. 
+.PP +The hash described above is returned for locale-independent casing, where +at least one of the mappings has length longer than one. If \f(CW\*(C`undef\*(C'\fR is +returned, the code point may have mappings, but if so, all are length one, +and are returned by "\fBcharinfo()\fR". +Note that when this function does return a value, it will be for the complete +set of mappings for a code point, even those whose length is one. +.PP +If there are additional casing rules that apply only in certain locales, +an additional key for each will be defined in the returned hash. Each such key +will be its locale name, defined as a 2\-letter ISO 3166 country code, possibly +followed by a "_" and a 2\-letter ISO language code (possibly followed by a "_" +and a variant code). To find the lists of all possible locales, see +Locale::Country and Locale::Language. +(In Unicode 6.0, the only locales returned by this function +are \f(CW\*(C`lt\*(C'\fR, \f(CW\*(C`tr\*(C'\fR, and \f(CW\*(C`az\*(C'\fR.) +.PP +Each locale key is a reference to a hash that has the form above, and gives +the casing rules for that particular locale, which take precedence over the +locale-independent ones when in that locale. +.PP +If the only casing for a code point is locale-dependent, then the returned +hash will not have any of the base keys, like \f(CW\*(C`code\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, etc., but +will contain only locale keys. +.PP +For more information about case mappings see +<http://www.unicode.org/reports/tr21/> +.SS \fBnamedseq()\fP +.IX Subsection "namedseq()" +.Vb 1 +\& use Unicode::UCD \*(Aqnamedseq\*(Aq; +\& +\& my $namedseq = namedseq("KATAKANA LETTER AINU P"); +\& my @namedseq = namedseq("KATAKANA LETTER AINU P"); +\& my %namedseq = namedseq(); +.Ve +.PP +If used with a single argument in a scalar context, returns the string +consisting of the code points of the named sequence, or \f(CW\*(C`undef\*(C'\fR if no +named sequence by that name exists. 
If used with a single argument in +a list context, it returns the list of the ordinals of the code points. +.PP +If used with no +arguments in a list context, it returns a hash with the names of all the +named sequences as the keys and their sequences as strings as +the values. Otherwise, it returns \f(CW\*(C`undef\*(C'\fR or an empty list depending +on the context. +.PP +This function only operates on officially approved (not provisional) named +sequences. +.PP +Note that as of Perl 5.14, \f(CW\*(C`\eN{KATAKANA LETTER AINU P}\*(C'\fR will insert the named +sequence into double-quoted strings, and \f(CW\*(C`charnames::string_vianame("KATAKANA +LETTER AINU P")\*(C'\fR will return the same string this function does, but will also +operate on character names that aren't named sequences, without you having to +know which are which. See charnames. +.SS \fBnum()\fP +.IX Subsection "num()" +.Vb 1 +\& use Unicode::UCD \*(Aqnum\*(Aq; +\& +\& my $val = num("123"); +\& my $one_quarter = num("\eN{VULGAR FRACTION ONE QUARTER}"); +\& my $val = num("12a", \e$valid_length); # $valid_length contains 2 +.Ve +.PP +\&\f(CWnum()\fR returns the numeric value of the input Unicode string; or \f(CW\*(C`undef\*(C'\fR if it +doesn't think the entire string has a completely valid, safe numeric value. +If called with an optional second parameter, a reference to a scalar, \f(CWnum()\fR +will set the scalar to the length of any valid initial substring; or to 0 if none. +.PP +If the string is just one character in length, the Unicode numeric value +is returned if it has one, or \f(CW\*(C`undef\*(C'\fR otherwise. If the optional scalar ref +is passed, it would be set to 1 if the return is valid; or 0 if the return is +\&\f(CW\*(C`undef\*(C'\fR. Note that the numeric value returned need not be a whole number. +\&\f(CW\*(C`num("\eN{TIBETAN DIGIT HALF ZERO}")\*(C'\fR, for example returns \-0.5. 
+.PP +If the string is more than one character, \f(CW\*(C`undef\*(C'\fR is returned unless +all its characters are decimal digits (that is, they would match \f(CW\*(C`\ed+\*(C'\fR), +from the same script. For example if you have an ASCII '0' and a Bengali +\&'3', mixed together, they aren't considered a valid number, and \f(CW\*(C`undef\*(C'\fR +is returned. A further restriction is that the digits all have to be of +the same form. A half-width digit mixed with a full-width one will +return \f(CW\*(C`undef\*(C'\fR. The Arabic script has two sets of digits; \f(CW\*(C`num\*(C'\fR will +return \f(CW\*(C`undef\*(C'\fR unless all the digits in the string come from the same +set. In all cases, the optional scalar ref parameter is set to how +long any valid initial substring of digits is; hence it will be set to the +entire string length if the main return value is not \f(CW\*(C`undef\*(C'\fR. +.PP +\&\f(CW\*(C`num\*(C'\fR errs on the side of safety, and there may be valid strings of +decimal digits that it doesn't recognize. Note that Unicode defines +a number of "digit" characters that aren't "decimal digit" characters. +"Decimal digits" have the property that they have a positional value, i.e., +there is a units position, a 10's position, a 100's, etc, AND they are +arranged in Unicode in blocks of 10 contiguous code points. The Chinese +digits, for example, are not in such a contiguous block, and so Unicode +doesn't view them as decimal digits, but merely digits, and so \f(CW\*(C`\ed\*(C'\fR will not +match them. A single-character string containing one of these digits will +have its decimal value returned by \f(CW\*(C`num\*(C'\fR, but any longer string containing +only these digits will return \f(CW\*(C`undef\*(C'\fR. +.PP +Strings of multiple sub\- and superscripts are not recognized as numbers. You +can use either of the compatibility decompositions in Unicode::Normalize to +change these into digits, and then call \f(CW\*(C`num\*(C'\fR on the result. 
+.SS \fBprop_aliases()\fP +.IX Subsection "prop_aliases()" +.Vb 1 +\& use Unicode::UCD \*(Aqprop_aliases\*(Aq; +\& +\& my ($short_name, $full_name, @other_names) = prop_aliases("space"); +\& my $same_full_name = prop_aliases("Space"); # Scalar context +\& my ($same_short_name) = prop_aliases("Space"); # gets 0th element +\& print "The full name is $full_name\en"; +\& print "The short name is $short_name\en"; +\& print "The other aliases are: ", join(", ", @other_names), "\en"; +\& +\& prints: +\& The full name is White_Space +\& The short name is WSpace +\& The other aliases are: Space +.Ve +.PP +Most Unicode properties have several synonymous names. Typically, there is at +least a short name, convenient to type, and a long name that more fully +describes the property, and hence is more easily understood. +.PP +If you know one name for a Unicode property, you can use \f(CW\*(C`prop_aliases\*(C'\fR to find +either the long name (when called in scalar context), or a list of all of the +names, somewhat ordered so that the short name is in the 0th element, the long +name in the next element, and any other synonyms are in the remaining +elements, in no particular order. +.PP +The long name is returned in a form nicely capitalized, suitable for printing. +.PP +The input parameter name is loosely matched, which means that white space, +hyphens, and underscores are ignored (except for the trailing underscore in +the old-form grandfathered-in \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR, and +both of which mean \f(CW\*(C`General_Category=Cased Letter\*(C'\fR). +.PP +If the name is unknown, \f(CW\*(C`undef\*(C'\fR is returned (or an empty list in list +context). Note that Perl typically recognizes property names in regular +expressions with an optional \f(CW"Is_"\fR (with or without the underscore) +prefixed to them, such as \f(CW\*(C`\ep{isgc=punct}\*(C'\fR. This function does not recognize +those in the input, returning \f(CW\*(C`undef\*(C'\fR. 
Nor are they included in the output +as possible synonyms. +.PP +\&\f(CW\*(C`prop_aliases\*(C'\fR does know about the Perl extensions to Unicode properties, +such as \f(CW\*(C`Any\*(C'\fR and \f(CW\*(C`XPosixAlpha\*(C'\fR, and the single form equivalents to Unicode +properties such as \f(CW\*(C`XDigit\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`In_Greek\*(C'\fR, and \f(CW\*(C`Is_Greek\*(C'\fR. The +final example demonstrates that the \f(CW"Is_"\fR prefix is recognized for these +extensions; it is needed to resolve ambiguities. For example, +\&\f(CWprop_aliases(\*(Aqlc\*(Aq)\fR returns the list \f(CW\*(C`(lc, Lowercase_Mapping)\*(C'\fR, but +\&\f(CWprop_aliases(\*(Aqislc\*(Aq)\fR returns \f(CW\*(C`(Is_LC, Cased_Letter)\*(C'\fR. This is +because \f(CW\*(C`islc\*(C'\fR is a Perl extension which is short for +\&\f(CW\*(C`General_Category=Cased Letter\*(C'\fR. The lists returned for the Perl extensions +will not include the \f(CW"Is_"\fR prefix (whether or not the input had it) unless +needed to resolve ambiguities, as shown in the \f(CW"islc"\fR example, where the +returned list had one element containing \f(CW"Is_"\fR, and the other without. +.PP +It is also possible for the reverse to happen: \f(CWprop_aliases(\*(Aqisc\*(Aq)\fR returns +the list \f(CW\*(C`(isc, ISO_Comment)\*(C'\fR; whereas \f(CWprop_aliases(\*(Aqc\*(Aq)\fR returns +\&\f(CW\*(C`(C, Other)\*(C'\fR (the latter being a Perl extension meaning +\&\f(CW\*(C`General_Category=Other\*(C'\fR). +"Properties accessible through Unicode::UCD" in perluniprops lists the available +forms, including which ones are discouraged from use. +.PP +Those discouraged forms are accepted as input to \f(CW\*(C`prop_aliases\*(C'\fR, but are not +returned in the lists. \f(CWprop_aliases(\*(AqisL&\*(Aq)\fR and \f(CWprop_aliases(\*(AqisL_\*(Aq)\fR, +which are old synonyms for \f(CW"Is_LC"\fR and should not be used in new code, are +examples of this. These both return \f(CW\*(C`(Is_LC, Cased_Letter)\*(C'\fR. 
Thus this +function allows you to take a discouraged form, and find its acceptable +alternatives. The same goes with single-form Block property equivalences. +Only the forms that begin with \f(CW"In_"\fR are not discouraged; if you pass +\&\f(CW\*(C`prop_aliases\*(C'\fR a discouraged form, you will get back the equivalent ones that +begin with \f(CW"In_"\fR. It will otherwise look like a new-style block name (see +"Old-style versus new-style block names"). +.PP +\&\f(CW\*(C`prop_aliases\*(C'\fR does not know about any user-defined properties, and will +return \f(CW\*(C`undef\*(C'\fR if called with one of those. Likewise for Perl internal +properties, with the exception of "Perl_Decimal_Digit" which it does know +about (and which is documented below in "\fBprop_invmap()\fR"). +.SS \fBprop_values()\fP +.IX Subsection "prop_values()" +.Vb 1 +\& use Unicode::UCD \*(Aqprop_values\*(Aq; +\& +\& print "AHex values are: ", join(", ", prop_values("AHex")), +\& "\en"; +\& prints: +\& AHex values are: N, Y +.Ve +.PP +Some Unicode properties have a restricted set of legal values. For example, +all binary properties are restricted to just \f(CW\*(C`true\*(C'\fR or \f(CW\*(C`false\*(C'\fR; and there +are only a few dozen possible General Categories. Use \f(CW\*(C`prop_values\*(C'\fR +to find out if a given property is one such, and if so, to get a list of the +values: +.PP +.Vb 3 +\& print join ", ", prop_values("NFC_Quick_Check"); +\& prints: +\& M, N, Y +.Ve +.PP +If the property doesn't have such a restricted set, \f(CW\*(C`undef\*(C'\fR is returned. +.PP +There are usually several synonyms for each possible value. Use +"\fBprop_value_aliases()\fR" to access those. +.PP +Case, white space, hyphens, and underscores are ignored in the input property +name (except for the trailing underscore in the old-form grandfathered-in +general category property value \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR). 
+.PP +If the property name is unknown, \f(CW\*(C`undef\*(C'\fR is returned. Note that Perl typically +recognizes property names in regular expressions with an optional \f(CW"Is_"\fR +(with or without the underscore) prefixed to them, such as \f(CW\*(C`\ep{isgc=punct}\*(C'\fR. +This function does not recognize those in the property parameter, returning +\&\f(CW\*(C`undef\*(C'\fR. +.PP +For the block property, new-style block names are returned (see +"Old-style versus new-style block names"). +.PP +\&\f(CW\*(C`prop_values\*(C'\fR does not know about any user-defined properties, and +will return \f(CW\*(C`undef\*(C'\fR if called with one of those. +.SS \fBprop_value_aliases()\fP +.IX Subsection "prop_value_aliases()" +.Vb 1 +\& use Unicode::UCD \*(Aqprop_value_aliases\*(Aq; +\& +\& my ($short_name, $full_name, @other_names) +\& = prop_value_aliases("Gc", "Punct"); +\& my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt +\& my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th +\& # element +\& print "The full name is $full_name\en"; +\& print "The short name is $short_name\en"; +\& print "The other aliases are: ", join(", ", @other_names), "\en"; +\& +\& prints: +\& The full name is Punctuation +\& The short name is P +\& The other aliases are: Punct +.Ve +.PP +Some Unicode properties have a restricted set of legal values. For example, +all binary properties are restricted to just \f(CW\*(C`true\*(C'\fR or \f(CW\*(C`false\*(C'\fR; and there +are only a few dozen possible General Categories. +.PP +You can use "\fBprop_values()\fR" to find out if a given property is one which has +a restricted set of values, and if so, what those values are. But usually +each value actually has several synonyms. For example, in Unicode binary +properties, \fItruth\fR can be represented by any of the strings "Y", "Yes", "T", +or "True"; and the General Category "Punctuation" by that string, or "Punct", +or simply "P". 
+.PP +Like property names, there is typically at least a short name for each such +property-value, and a long name. If you know any name of the property-value +(which you can get by "\fBprop_values()\fR"), you can use \f(CW\*(C`prop_value_aliases\*(C'\fR() +to get the long name (when called in scalar context), or a list of all the +names, with the short name in the 0th element, the long name in the next +element, and any other synonyms in the remaining elements, in no particular +order, except that any all-numeric synonyms will be last. +.PP +The long name is returned in a form nicely capitalized, suitable for printing. +.PP +Case, white space, hyphens, and underscores are ignored in the input parameters +(except for the trailing underscore in the old-form grandfathered-in general +category property value \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR). +.PP +If either name is unknown, \f(CW\*(C`undef\*(C'\fR is returned. Note that Perl typically +recognizes property names in regular expressions with an optional \f(CW"Is_"\fR +(with or without the underscore) prefixed to them, such as \f(CW\*(C`\ep{isgc=punct}\*(C'\fR. +This function does not recognize those in the property parameter, returning +\&\f(CW\*(C`undef\*(C'\fR. +.PP +If called with a property that doesn't have synonyms for its values, it +returns the input value, possibly normalized with capitalization and +underscores, but not necessarily checking that the input value is valid. +.PP +For the block property, new-style block names are returned (see +"Old-style versus new-style block names"). +.PP +To find the synonyms for single-forms, such as \f(CW\*(C`\ep{Any}\*(C'\fR, use +"\fBprop_aliases()\fR" instead. +.PP +\&\f(CW\*(C`prop_value_aliases\*(C'\fR does not know about any user-defined properties, and +will return \f(CW\*(C`undef\*(C'\fR if called with one of those. 
+.SS \fBprop_invlist()\fP +.IX Subsection "prop_invlist()" +\&\f(CW\*(C`prop_invlist\*(C'\fR returns an inversion list (described below) that defines all the +code points for the binary Unicode property (or "property=value" pair) given +by the input parameter string: +.PP +.Vb 3 +\& use feature \*(Aqsay\*(Aq; +\& use Unicode::UCD \*(Aqprop_invlist\*(Aq; +\& say join ", ", prop_invlist("Any"); +\& +\& prints: +\& 0, 1114112 +.Ve +.PP +If the input is unknown \f(CW\*(C`undef\*(C'\fR is returned in scalar context; an empty-list +in list context. If the input is known, the number of elements in +the list is returned if called in scalar context. +.PP +perluniprops gives +the list of properties that this function accepts, as well as all the possible +forms for them (including with the optional "Is_" prefixes). (Except this +function doesn't accept any Perl-internal properties, some of which are listed +there.) This function uses the same loose or tighter matching rules for +resolving the input property's name as is done for regular expressions. These +are also specified in perluniprops. Examples of using the "property=value" form are: +.PP +.Vb 1 +\& say join ", ", prop_invlist("Script_Extensions=Shavian"); +\& +\& prints: +\& 66640, 66688 +\& +\& say join ", ", prop_invlist("ASCII_Hex_Digit=No"); +\& +\& prints: +\& 0, 48, 58, 65, 71, 97, 103 +\& +\& say join ", ", prop_invlist("ASCII_Hex_Digit=Yes"); +\& +\& prints: +\& 48, 58, 65, 71, 97, 103 +.Ve +.PP +Inversion lists are a compact way of specifying Unicode property-value +definitions. The 0th item in the list is the lowest code point that has the +property-value. The next item (item [1]) is the lowest code point beyond that +one that does NOT have the property-value. And the next item beyond that +([2]) is the lowest code point beyond that one that does have the +property-value, and so on. 
Put another way, each element in the list gives +the beginning of a range that has the property-value (for even numbered +elements), or doesn't have the property-value (for odd numbered elements). +The name for this data structure stems from the fact that each element in the +list toggles (or inverts) whether the corresponding range is or isn't on the +list. +.PP +In the final example above, the first ASCII Hex digit is code point 48, the +character "0", and all code points from it through 57 (a "9") are ASCII hex +digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F") +are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points +that aren't ASCII hex digits. That range extends to infinity, which on your +computer can be found in the variable \f(CW$Unicode::UCD::MAX_CP\fR. (This +variable is as close to infinity as Perl can get on your platform, and may be +too high for some operations to work; you may wish to use a smaller number for +your purposes.) +.PP +Note that the inversion lists returned by this function can possibly include +non-Unicode code points, that is anything above 0x10FFFF. Unicode properties +are not defined on such code points. You might wish to change the output to +not include these. Simply add 0x110000 at the end of the non-empty returned +list if it isn't already that value; and pop that value if it is; like: +.PP +.Vb 9 +\& my @list = prop_invlist("foo"); +\& if (@list) { +\& if ($list[\-1] == 0x110000) { +\& pop @list; # Defeat the turning on for above Unicode +\& } +\& else { +\& push @list, 0x110000; # Turn off for above Unicode +\& } +\& } +.Ve +.PP +It is a simple matter to expand out an inversion list to a full list of all +code points that have the property-value: +.PP +.Vb 11 +\& my @invlist = prop_invlist($property_name); +\& die "empty" unless @invlist; +\& my @full_list; +\& for (my $i = 0; $i < @invlist; $i += 2) { +\& my $upper = ($i + 1) < @invlist +\& ? 
$invlist[$i+1] \- 1 # In range +\& : $Unicode::UCD::MAX_CP; # To infinity. +\& for my $j ($invlist[$i] .. $upper) { +\& push @full_list, $j; +\& } +\& } +.Ve +.PP +\&\f(CW\*(C`prop_invlist\*(C'\fR does not know about any user-defined nor Perl internal-only +properties, and will return \f(CW\*(C`undef\*(C'\fR if called with one of those. +.PP +The "\fBsearch_invlist()\fR" function is provided for finding a code point within +an inversion list. +.SS \fBprop_invmap()\fP +.IX Subsection "prop_invmap()" +.Vb 3 +\& use Unicode::UCD \*(Aqprop_invmap\*(Aq; +\& my ($list_ref, $map_ref, $format, $default) +\& = prop_invmap("General Category"); +.Ve +.PP +\&\f(CW\*(C`prop_invmap\*(C'\fR is used to get the complete mapping definition for a property, +in the form of an inversion map. An inversion map consists of two parallel +arrays. One is an ordered list of code points that mark range beginnings, and +the other gives the value (or mapping) that all code points in the +corresponding range have. +.PP +\&\f(CW\*(C`prop_invmap\*(C'\fR is called with the name of the desired property. The name is +loosely matched, meaning that differences in case, white-space, hyphens, and +underscores are not meaningful (except for the trailing underscore in the +old-form grandfathered-in property \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR, +or even better, \f(CW"Gc=LC"\fR). +.PP +Many Unicode properties have more than one name (or alias). \f(CW\*(C`prop_invmap\*(C'\fR +understands all of these, including Perl extensions to them. Ambiguities are +resolved as described above for "\fBprop_aliases()\fR" (except if a property has +both a complete mapping, and a binary \f(CW\*(C`Y\*(C'\fR/\f(CW\*(C`N\*(C'\fR mapping, then specifying the +property name prefixed by \f(CW"is"\fR causes the binary one to be returned). The +Perl internal property "Perl_Decimal_Digit", described below, is also accepted. +An empty list is returned if the property name is unknown. 
+See "Properties accessible through Unicode::UCD" in perluniprops for the +properties acceptable as inputs to this function. +.PP +It is a fatal error to call this function except in list context. +.PP +In addition to the two arrays that form the inversion map, \f(CW\*(C`prop_invmap\*(C'\fR +returns two other values; one is a scalar that gives some details as to the +format of the entries of the map array; the other is a default value, useful +in maps whose format name begins with the letter \f(CW"a"\fR, as described +below in its subsection; and for specialized purposes, such as +converting to another data structure, described at the end of this main +section. +.PP +This means that \f(CW\*(C`prop_invmap\*(C'\fR returns a 4 element list. For example, +.PP +.Vb 2 +\& my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default) +\& = prop_invmap("Block"); +.Ve +.PP +In this call, the two arrays will be populated as shown below (for Unicode +6.0): +.PP +.Vb 10 +\& Index @blocks_ranges @blocks_maps +\& 0 0x0000 Basic Latin +\& 1 0x0080 Latin\-1 Supplement +\& 2 0x0100 Latin Extended\-A +\& 3 0x0180 Latin Extended\-B +\& 4 0x0250 IPA Extensions +\& 5 0x02B0 Spacing Modifier Letters +\& 6 0x0300 Combining Diacritical Marks +\& 7 0x0370 Greek and Coptic +\& 8 0x0400 Cyrillic +\& ... +\& 233 0x2B820 No_Block +\& 234 0x2F800 CJK Compatibility Ideographs Supplement +\& 235 0x2FA20 No_Block +\& 236 0xE0000 Tags +\& 237 0xE0080 No_Block +\& 238 0xE0100 Variation Selectors Supplement +\& 239 0xE01F0 No_Block +\& 240 0xF0000 Supplementary Private Use Area\-A +\& 241 0x100000 Supplementary Private Use Area\-B +\& 242 0x110000 No_Block +.Ve +.PP +The first line (with Index [0]) means that the value for code point 0 is "Basic +Latin". The entry "0x0080" in the \f(CW@blocks_ranges\fR column in the second line +means that the value from the first line, "Basic Latin", extends to all code +points in the range from 0 up to but not including 0x0080, that is, through +127. 
In other words, the code points from 0 to 127 are all in the "Basic +Latin" block. Similarly, all code points in the range from 0x0080 up to (but +not including) 0x0100 are in the block named "Latin\-1 Supplement", etc. +(Notice that the return is the old-style block names; see "Old-style versus +new-style block names"). +.PP +The final line (with Index [242]) means that all code points above +the legal Unicode maximum code point have the value "No_Block", which is the +term Unicode uses for a non-existing block. +.PP +The arrays completely specify the mappings for all possible code points. +The final element in an inversion map returned by this function will always be +for the range that consists of all the code points that aren't legal Unicode, +but that are expressible on the platform. (That is, it starts with code point +0x110000, the first code point above the legal Unicode maximum, and extends to +infinity.) The value for that range will be the same that any typical +unassigned code point has for the specified property. (Certain unassigned +code points are not "typical"; for example the non-character code points, or +those in blocks that are to be written right-to-left. The above-Unicode +range's value is not based on these atypical code points.) It could be argued +that, instead of treating these as unassigned Unicode code points, the value +for this range should be \f(CW\*(C`undef\*(C'\fR. If you wish, you can change the returned +arrays accordingly. +.PP +The maps for almost all properties are simple scalars that should be +interpreted as-is. +These values are those given in the Unicode-supplied data files, which may be +inconsistent as to capitalization and as to which synonym for a property-value +is given. The results may be normalized by using the "\fBprop_value_aliases()\fR" +function. +.PP +There are exceptions to the simple scalar maps. 
Some properties have some +elements in their map list that are themselves lists of scalars; and some +special strings are returned that are not to be interpreted as-is. Element +[2] (placed into \f(CW$format\fR in the example above) of the returned four element +list tells you if the map has any of these special elements or not, as follows: +.ie n .IP "\fR\fB""s""\fR\fB\fR" 4 +.el .IP \fR\f(CBs\fR\fB\fR 4 +.IX Item "s" +means all the elements of the map array are simple scalars, with no special +elements. Almost all properties are like this, like the \f(CW\*(C`block\*(C'\fR example +above. +.ie n .IP "\fR\fB""sl""\fR\fB\fR" 4 +.el .IP \fR\f(CBsl\fR\fB\fR 4 +.IX Item "sl" +means that some of the map array elements have the form given by \f(CW"s"\fR, and +the rest are lists of scalars. For example, here is a portion of the output +of calling \f(CW\*(C`prop_invmap\*(C'\fR() with the "Script Extensions" property: +.Sp +.Vb 6 +\& @scripts_ranges @scripts_maps +\& ... +\& 0x0953 Devanagari +\& 0x0964 [ Bengali, Devanagari, Gurmukhi, Oriya ] +\& 0x0966 Devanagari +\& 0x0970 Common +.Ve +.Sp +Here, the code points 0x964 and 0x965 are both used in Bengali, +Devanagari, Gurmukhi, and Oriya, but no other scripts. +.Sp +The Name_Alias property is also of this form. But each scalar consists of two +components: 1) the name, and 2) the type of alias this is. They are +separated by a colon and a space. In Unicode 6.1, there are several alias types: +.RS 4 +.ie n .IP """correction""" 4 +.el .IP \f(CWcorrection\fR 4 +.IX Item "correction" +indicates that the name is a corrected form for the +original name (which remains valid) for the same code point. +.ie n .IP """control""" 4 +.el .IP \f(CWcontrol\fR 4 +.IX Item "control" +adds a new name for a control character. 
+.ie n .IP """alternate""" 4 +.el .IP \f(CWalternate\fR 4 +.IX Item "alternate" +is an alternate name for a character +.ie n .IP """figment""" 4 +.el .IP \f(CWfigment\fR 4 +.IX Item "figment" +is a name for a character that has been documented but was never in any +actual standard. +.ie n .IP """abbreviation""" 4 +.el .IP \f(CWabbreviation\fR 4 +.IX Item "abbreviation" +is a common abbreviation for a character +.RE +.RS 4 +.Sp +The lists are ordered (roughly) so the most preferred names come before less +preferred ones. +.Sp +For example, +.Sp +.Vb 10 +\& @aliases_ranges @alias_maps +\& ... +\& 0x009E [ \*(AqPRIVACY MESSAGE: control\*(Aq, \*(AqPM: abbreviation\*(Aq ] +\& 0x009F [ \*(AqAPPLICATION PROGRAM COMMAND: control\*(Aq, +\& \*(AqAPC: abbreviation\*(Aq +\& ] +\& 0x00A0 \*(AqNBSP: abbreviation\*(Aq +\& 0x00A1 "" +\& 0x00AD \*(AqSHY: abbreviation\*(Aq +\& 0x00AE "" +\& 0x01A2 \*(AqLATIN CAPITAL LETTER GHA: correction\*(Aq +\& 0x01A3 \*(AqLATIN SMALL LETTER GHA: correction\*(Aq +\& 0x01A4 "" +\& ... +.Ve +.Sp +A map to the empty string means that there is no alias defined for the code +point. +.RE +.ie n .IP "\fR\fB""a""\fR\fB\fR" 4 +.el .IP \fR\f(CBa\fR\fB\fR 4 +.IX Item "a" +is like \f(CW"s"\fR in that all the map array elements are scalars, but here they are +restricted to all being integers, and some have to be adjusted (hence the name +\&\f(CW"a"\fR) to get the correct result. For example, in: +.Sp +.Vb 2 +\& my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default) +\& = prop_invmap("Simple_Uppercase_Mapping"); +.Ve +.Sp +the returned arrays look like this: +.Sp +.Vb 7 +\& @$uppers_ranges_ref @$uppers_maps_ref Note +\& 0 0 +\& 97 65 \*(Aqa\*(Aq maps to \*(AqA\*(Aq, b => B ... +\& 123 0 +\& 181 924 MICRO SIGN => Greek Cap MU +\& 182 0 +\& ... +.Ve +.Sp +and \f(CW$default\fR is 0. +.Sp +Let's start with the second line. It says that the uppercase of code point 97 +is 65; or \f(CWuc("a")\fR == "A". 
But the line is for the entire range of code +points 97 through 122. To get the mapping for any code point in this range, +you take the offset it has from the beginning code point of the range, and add +that to the mapping for that first code point. So, the mapping for 122 ("z") +is derived by taking the offset of 122 from 97 (=25) and adding that to 65, +yielding 90 ("Z"). Likewise for everything in between. +.Sp +Requiring this simple adjustment allows the returned arrays to be +significantly smaller than otherwise, up to a factor of 10, speeding up +searching through them. +.Sp +Ranges that map to \f(CW$default\fR, \f(CW"0"\fR, behave somewhat differently. For +these, each code point maps to itself. So, in the first line in the example, +\&\f(CW\*(C`ord(uc(chr(0)))\*(C'\fR is 0, \f(CW\*(C`ord(uc(chr(1)))\*(C'\fR is 1, .. +\&\f(CW\*(C`ord(uc(chr(96)))\*(C'\fR is 96. +.ie n .IP "\fR\fB""al""\fR\fB\fR" 4 +.el .IP \fR\f(CBal\fR\fB\fR 4 +.IX Item "al" +means that some of the map array elements have the form given by \f(CW"a"\fR, and +the rest are ordered lists of code points. +For example, in: +.Sp +.Vb 2 +\& my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default) +\& = prop_invmap("Uppercase_Mapping"); +.Ve +.Sp +the returned arrays look like this: +.Sp +.Vb 11 +\& @$uppers_ranges_ref @$uppers_maps_ref +\& 0 0 +\& 97 65 +\& 123 0 +\& 181 924 +\& 182 0 +\& ... +\& 0x0149 [ 0x02BC 0x004E ] +\& 0x014A 0 +\& 0x014B 330 +\& ... +.Ve +.Sp +This is the full Uppercase_Mapping property (as opposed to the +Simple_Uppercase_Mapping given in the example for format \f(CW"a"\fR). The only +difference between the two in the ranges shown is that the code point at +0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two +characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN +CAPITAL LETTER N). 
+.Sp +No adjustments are needed to entries that are references to arrays; each such +entry will have exactly one element in its range, so the offset is always 0. +.Sp +The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this +format is 0. +.ie n .IP "\fR\fB""ae""\fR\fB\fR" 4 +.el .IP \fR\f(CBae\fR\fB\fR 4 +.IX Item "ae" +This is like \f(CW"a"\fR, but some elements are the empty string, and should not be +adjusted. +The one internal Perl property accessible by \f(CW\*(C`prop_invmap\*(C'\fR is of this type: +"Perl_Decimal_Digit" returns an inversion map which gives the numeric values +that are represented by the Unicode decimal digit characters. Characters that +don't represent decimal digits map to the empty string, like so: +.Sp +.Vb 12 +\& @digits @values +\& 0x0000 "" +\& 0x0030 0 +\& 0x003A: "" +\& 0x0660: 0 +\& 0x066A: "" +\& 0x06F0: 0 +\& 0x06FA: "" +\& 0x07C0: 0 +\& 0x07CA: "" +\& 0x0966: 0 +\& ... +.Ve +.Sp +This means that the code points from 0 to 0x2F do not represent decimal digits; +the code point 0x30 (DIGIT ZERO) represents 0; code point 0x31, (DIGIT ONE), +represents 0+1\-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9\-0 = 9; +\&... code points 0x3A through 0x65F do not represent decimal digits; 0x660 +(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE), +represents 0+1\-0 = 1 ... +.Sp +The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this +format is the empty string. +.ie n .IP "\fR\fB""ale""\fR\fB\fR" 4 +.el .IP \fR\f(CBale\fR\fB\fR 4 +.IX Item "ale" +is a combination of the \f(CW"al"\fR type and the \f(CW"ae"\fR type. Some of +the map array elements have the forms given by \f(CW"al"\fR, and +the rest are the empty string. The property \f(CW\*(C`NFKC_Casefold\*(C'\fR has this form. +An example slice is: +.Sp +.Vb 9 +\& @$ranges_ref @$maps_ref Note +\& ... 
+\& 0x00AA 97 FEMININE ORDINAL INDICATOR => \*(Aqa\*(Aq +\& 0x00AB 0 +\& 0x00AD SOFT HYPHEN => "" +\& 0x00AE 0 +\& 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON +\& 0x00B0 0 +\& ... +.Ve +.Sp +The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this +format is 0. +.ie n .IP "\fR\fB""ar""\fR\fB\fR" 4 +.el .IP \fR\f(CBar\fR\fB\fR 4 +.IX Item "ar" +means that all the elements of the map array are either rational numbers or +the string \f(CW"NaN"\fR, meaning "Not a Number". A rational number is either an +integer, or two integers separated by a solidus (\f(CW"/"\fR). The second integer +represents the denominator of the division implied by the solidus, and is +actually always positive, so it is guaranteed not to be 0 and to not be +signed. When the element is a plain integer (without the +solidus), it may need to be adjusted to get the correct value by adding the +offset, just as other \f(CW"a"\fR properties. No adjustment is needed for +fractions, as the range is guaranteed to have just a single element, and so +the offset is always 0. +.Sp +If you want to convert the returned map to entirely scalar numbers, you +can use something like this: +.Sp +.Vb 4 +\& my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property); +\& if ($format && $format eq "ar") { +\& map { $_ = eval $_ if $_ ne \*(AqNaN\*(Aq } @$invmap_ref; +\& } +.Ve +.Sp +Here are some entries from the output of the property "Nv", which has format +\&\f(CW"ar"\fR. +.Sp +.Vb 10 +\& @numerics_ranges @numerics_maps Note +\& 0x00 "NaN" +\& 0x30 0 DIGIT 0 .. DIGIT 9 +\& 0x3A "NaN" +\& 0xB2 2 SUPERSCRIPTs 2 and 3 +\& 0xB4 "NaN" +\& 0xB9 1 SUPERSCRIPT 1 +\& 0xBA "NaN" +\& 0xBC 1/4 VULGAR FRACTION 1/4 +\& 0xBD 1/2 VULGAR FRACTION 1/2 +\& 0xBE 3/4 VULGAR FRACTION 3/4 +\& 0xBF "NaN" +\& 0x660 0 ARABIC\-INDIC DIGIT ZERO .. NINE +\& 0x66A "NaN" +.Ve +.Sp +The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this +format is \f(CW"NaN"\fR.
+.ie n .IP "\fR\fB""n""\fR\fB\fR" 4 +.el .IP \fR\f(CBn\fR\fB\fR 4 +.IX Item "n" +means the Name property. All the elements of the map array are simple +scalars, but some of them contain special strings that require more work to +get the actual name. +.Sp +Entries such as: +.Sp +.Vb 1 +\& CJK UNIFIED IDEOGRAPH\-<code point> +.Ve +.Sp +mean that the name for the code point is "CJK UNIFIED IDEOGRAPH\-" +with the code point (expressed in hexadecimal) appended to it, like "CJK +UNIFIED IDEOGRAPH\-3403" (similarly for \f(CW\*(C`CJK\ COMPATIBILITY\ IDEOGRAPH\-<code\ point>\*(C'\fR). +.Sp +Also, entries like +.Sp +.Vb 1 +\& <hangul syllable> +.Ve +.Sp +mean that the name is algorithmically calculated. This is easily done by +the function "charnames::viacode(code)" in charnames. +.Sp +Note that for control characters (\f(CW\*(C`Gc=cc\*(C'\fR), Unicode's data files have the +string "\f(CW\*(C`<control>\*(C'\fR", but the real name of each of these characters is the empty +string. This function returns that real name, the empty string. (There are +names for these characters, but they are considered aliases, not the Name +property name, and are contained in the \f(CW\*(C`Name_Alias\*(C'\fR property.) +.ie n .IP "\fR\fB""ad""\fR\fB\fR" 4 +.el .IP \fR\f(CBad\fR\fB\fR 4 +.IX Item "ad" +means the Decomposition_Mapping property. This property is like \f(CW"al"\fR +properties, except that one of the scalar elements is of the form: +.Sp +.Vb 1 +\& <hangul syllable> +.Ve +.Sp +This signifies that this entry should be replaced by the decompositions for +all the code points whose decomposition is algorithmically calculated. (All +of them are currently in one range and no others outside the range are likely +to ever be added to Unicode; the \f(CW"n"\fR format +has this same entry.) These can be generated via the function +\&\fBUnicode::Normalize::NFD()\fR.
+.Sp +Note that the mapping is the one that is specified in the Unicode data files, +and to get the final decomposition, it may need to be applied recursively. +Unicode in fact discourages use of this property except internally in +implementations of the Unicode Normalization Algorithm. +.Sp +The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this +format is 0. +.PP +Note that a format begins with the letter "a" if and only if the property it is +for requires adjustments by adding the offsets in multi-element ranges. For +all these properties, an entry should be adjusted only if the map is a scalar +which is an integer. That is, it must match the regular expression: +.PP +.Vb 1 +\& / ^ \-? \ed+ $ /xa +.Ve +.PP +Further, the first element in a range never needs adjustment, as the +adjustment would be just adding 0. +.PP +A binary search, such as that provided by "\fBsearch_invlist()\fR", can be used to +quickly find a code point in the inversion list, and hence its corresponding +mapping. +.PP +The final, fourth element (index [3], assigned to \f(CW$default\fR in the "block" +example) in the four element list returned by this function is used with the +\&\f(CW"a"\fR format types; it may also be useful for applications +that wish to convert the returned inversion map data structure into some +other, such as a hash. It gives the mapping that most code points map to +under the property. If you establish the convention that any code point not +explicitly listed in your data structure maps to this value, you can +potentially make your data structure much smaller. As you construct your data +structure from the one returned by this function, simply ignore those ranges +that map to this value.
For example, to +convert to the data structure searchable by "\fBcharinrange()\fR", you can follow +this recipe for properties that don't require adjustments: +.PP +.Vb 2 +\& my ($list_ref, $map_ref, $format, $default) = prop_invmap($property); +\& my @range_list; +\& +\& # Look at each element in the list, but the \-2 is needed because we +\& # look at $i+1 in the loop, and the final element is guaranteed to map +\& # to $default by prop_invmap(), so we would skip it anyway. +\& for my $i (0 .. @$list_ref \- 2) { +\& next if $map_ref\->[$i] eq $default; +\& push @range_list, [ $list_ref\->[$i], +\& $list_ref\->[$i+1] \- 1, +\& $map_ref\->[$i] +\& ]; +\& } +\& +\& print charinrange(\e@range_list, $code_point), "\en"; +.Ve +.PP +With this, \f(CWcharinrange()\fR will return \f(CW\*(C`undef\*(C'\fR if its input code point maps +to \f(CW$default\fR. You can avoid this by omitting the \f(CW\*(C`next\*(C'\fR statement, and adding +a line after the loop to handle the final element of the inversion map. +.PP +Similarly, this recipe can be used for properties that do require adjustments: +.PP +.Vb 2 +\& for my $i (0 .. @$list_ref \- 2) { +\& next if $map_ref\->[$i] eq $default; +\& +\& # prop_invmap() guarantees that if the mapping is to an array, the +\& # range has just one element, so no need to worry about adjustments. +\& if (ref $map_ref\->[$i]) { +\& push @range_list, +\& [ $list_ref\->[$i], $list_ref\->[$i], $map_ref\->[$i] ]; +\& } +\& else { # Otherwise each element is actually mapped to a separate +\& # value, so the range has to be split into single code point +\& # ranges. +\& +\& my $adjustment = 0; +\& +\& # For each code point that gets mapped to something... +\& for my $j ($list_ref\->[$i] .. $list_ref\->[$i+1] \-1 ) { +\& +\& # ...
add a range consisting of just it mapping to the +\& # original plus the adjustment, which is incremented for the +\& # next time through the loop, as the offset increases by 1 +\& # for each element in the range +\& push @range_list, +\& [ $j, $j, $map_ref\->[$i] + $adjustment++ ]; +\& } +\& } +\& } +.Ve +.PP +Note that the inversion maps returned for the \f(CW\*(C`Case_Folding\*(C'\fR and +\&\f(CW\*(C`Simple_Case_Folding\*(C'\fR properties do not include the Turkic-locale mappings. +Use "\fBcasefold()\fR" for these. +.PP +\&\f(CW\*(C`prop_invmap\*(C'\fR does not know about any user-defined properties, and will +return \f(CW\*(C`undef\*(C'\fR if called with one of those. +.PP +The returned values for the Perl extension properties, such as \f(CW\*(C`Any\*(C'\fR and +\&\f(CW\*(C`Greek\*(C'\fR, are somewhat misleading. The values are either \f(CW"Y"\fR or \f(CW"N"\fR. +All Unicode properties are bipartite, so you can actually use the \f(CW"Y"\fR or +\&\f(CW"N"\fR in a Perl regular expression for these, like \f(CW\*(C`qr/\ep{ID_Start=Y}/\*(C'\fR or +\&\f(CW\*(C`qr/\ep{Upper=N}/\*(C'\fR. But the Perl extensions aren't specified this way, only +like \f(CW\*(C`qr/\ep{Any}/\*(C'\fR, \fIetc\fR. You can't actually use the \f(CW"Y"\fR and \f(CW"N"\fR in +them. +.PP +\fIGetting every available name\fR +.IX Subsection "Getting every available name" +.PP +Instead of reading the Unicode Database directly from files, as you were able +to do for a long time, you are encouraged to use the supplied functions. So, +instead of reading \f(CW\*(C`Name.pl\*(C'\fR directly, which changed formats in 5.32, and may +do so again without notice in the future or even disappear, you ought to use +"\fBprop_invmap()\fR" like this: +.PP +.Vb 10 +\& my (%name, %cp, %cps, $n); +\& # All codepoints +\& foreach my $cat (qw( Name Name_Alias )) { +\& my ($codepoints, $names, $format, $default) = prop_invmap($cat); +\& # $format => "n", $default => "" +\& foreach my $i (0 ..
@$codepoints \- 2) { +\& my ($cp, $n) = ($codepoints\->[$i], $names\->[$i]); +\& # If $n is a ref, the same codepoint has multiple names +\& foreach my $name (ref $n ? @$n : $n) { +\& $name{$cp} //= $name; +\& $cp{$name} //= $cp; +\& } +\& } +\& } +\& # Named sequences +\& { my %ns = namedseq(); +\& foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) { +\& $cp{$name} //= [ map { ord } split "" => $ns{$name} ]; +\& } +\& } +.Ve +.SS \fBsearch_invlist()\fP +.IX Subsection "search_invlist()" +.Vb 2 +\& use Unicode::UCD qw(prop_invmap prop_invlist); +\& use Unicode::UCD \*(Aqsearch_invlist\*(Aq; +\& +\& my @invlist = prop_invlist($property_name); +\& print $code_point, ((search_invlist(\e@invlist, $code_point) // \-1) % 2) +\& ? " isn\*(Aqt" +\& : " is", +\& " in $property_name\en"; +\& +\& my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block"); +\& my $index = search_invlist($blocks_ranges_ref, $code_point); +\& print "$code_point is in block ", $blocks_map_ref\->[$index], "\en"; +.Ve +.PP +\&\f(CW\*(C`search_invlist\*(C'\fR is used to search an inversion list returned by +\&\f(CW\*(C`prop_invlist\*(C'\fR or \f(CW\*(C`prop_invmap\*(C'\fR for a particular "code point argument". +\&\f(CW\*(C`undef\*(C'\fR is returned if the code point is not found in the inversion list +(this happens only when it is not a legal "code point argument", or is less +than the list's first element). A warning is raised in the first instance. +.PP +Otherwise, it returns the index into the list of the range that contains the +code point; that is, it finds \f(CW\*(C`i\*(C'\fR such that +.PP +.Vb 1 +\& list[i] <= code_point < list[i+1]. +.Ve +.PP +As explained in "\fBprop_invlist()\fR", whether a code point is in the list or not +depends on if the index is even (in) or odd (not in). And as explained in +"\fBprop_invmap()\fR", the index is used with the returned parallel array to find +the mapping.
+.SS Unicode::UCD::UnicodeVersion +.IX Subsection "Unicode::UCD::UnicodeVersion" +This returns the version of the Unicode Character Database, in other words, the +version of the Unicode standard the database implements. The version is a +string of numbers delimited by dots (\f(CW\*(Aq.\*(Aq\fR). +.SS "\fBBlocks versus Scripts\fP" +.IX Subsection "Blocks versus Scripts" +The difference between a block and a script is that scripts are closer +to the linguistic notion of a set of code points required to represent +languages, while block is more of an artifact of the Unicode code point +numbering and separation into blocks of consecutive code points (so far the +size of a block is some multiple of 16, like 128 or 256). +.PP +For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such +as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and +\&\f(CW\*(C`Latin Extended\-B\*(C'\fR. On the other hand, the Latin script does not +contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as +ASCII): it includes only the letters, and not, for example, the digits +nor the punctuation. +.PP +For blocks see <http://www.unicode.org/Public/UNIDATA/Blocks.txt> +.PP +For scripts see UTR #24: <http://www.unicode.org/reports/tr24/> +.SS "\fBMatching Scripts and Blocks\fP" +.IX Subsection "Matching Scripts and Blocks" +Scripts are matched with the regular-expression construct +\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script), +while \f(CW\*(C`\ep{Blk=...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{Blk=Tibetan}\*(C'\fR matches +any of the 256 code points in the Tibetan block). +.SS "Old-style versus new-style block names" +.IX Subsection "Old-style versus new-style block names" +Unicode publishes the names of blocks in two different styles, though the two +are equivalent under Unicode's loose matching rules. 
+.PP +The original style uses blanks and hyphens in the block names (except for +\&\f(CW\*(C`No_Block\*(C'\fR), like so: +.PP +.Vb 1 +\& Miscellaneous Mathematical Symbols\-B +.Ve +.PP +The newer style replaces these with underscores, like this: +.PP +.Vb 1 +\& Miscellaneous_Mathematical_Symbols_B +.Ve +.PP +This newer style is consistent with the values of other Unicode properties. +To preserve backward compatibility, all the functions in Unicode::UCD that +return block names (except as noted) return the old-style ones. +"\fBprop_value_aliases()\fR" returns the new-style names and can be used to convert from +old-style to new-style: +.PP +.Vb 1 +\& my $new_style = prop_value_aliases("block", $old_style); +.Ve +.PP +Perl also has single-form extensions that refer to blocks, such as \f(CW\*(C`In_Cyrillic\*(C'\fR, +meaning \f(CW\*(C`Block=Cyrillic\*(C'\fR. These have always been written in the new style. +.PP +To convert from new-style to old-style, follow this recipe: +.PP +.Vb 1 +\& $old_style = charblock((prop_invlist("block=$new_style"))[0]); +.Ve +.PP +(which finds the range of code points in the block using \f(CW\*(C`prop_invlist\*(C'\fR, +gets the lower end of the range (0th element) and then looks up the old name +for its block using \f(CW\*(C`charblock\*(C'\fR). +.PP +Note that starting in Unicode 6.1, many of the block names have shorter +synonyms. These are always given in the new style. +.SS "Use with older Unicode versions" +.IX Subsection "Use with older Unicode versions" +The functions in this module work as well as can be expected when +used on earlier Unicode versions. But, obviously, they use the available data +from that Unicode version. For example, if the Unicode version predates the +definition of the script property (Unicode 3.1), then any function that deals +with scripts is going to return \f(CW\*(C`undef\*(C'\fR for the script portion of the return +value. +.SH AUTHOR +.IX Header "AUTHOR" +Jarkko Hietaniemi. Now maintained by perl5 porters. |