summaryrefslogtreecommitdiffstats
path: root/upstream/archlinux/man3/Unicode::UCD.3perl
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:43:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:43:11 +0000
commitfc22b3d6507c6745911b9dfcc68f1e665ae13dbc (patch)
treece1e3bce06471410239a6f41282e328770aa404a /upstream/archlinux/man3/Unicode::UCD.3perl
parentInitial commit. (diff)
downloadmanpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.tar.xz
manpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.zip
Adding upstream version 4.22.0.upstream/4.22.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'upstream/archlinux/man3/Unicode::UCD.3perl')
-rw-r--r--upstream/archlinux/man3/Unicode::UCD.3perl1894
1 files changed, 1894 insertions, 0 deletions
diff --git a/upstream/archlinux/man3/Unicode::UCD.3perl b/upstream/archlinux/man3/Unicode::UCD.3perl
new file mode 100644
index 00000000..fa0484c7
--- /dev/null
+++ b/upstream/archlinux/man3/Unicode::UCD.3perl
@@ -0,0 +1,1894 @@
+.\" -*- mode: troff; coding: utf-8 -*-
+.\" Automatically generated by Pod::Man 5.01 (Pod::Simple 3.43)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" \*(C` and \*(C' are quotes in nroff, nothing in troff, for use with C<>.
+.ie n \{\
+. ds C` ""
+. ds C' ""
+'br\}
+.el\{\
+. ds C`
+. ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD. Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{\
+. if \nF \{\
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
+..
+. if !\nF==2 \{\
+. nr % 0
+. nr F 2
+. \}
+. \}
+.\}
+.rr rF
+.\" ========================================================================
+.\"
+.IX Title "Unicode::UCD 3perl"
+.TH Unicode::UCD 3perl 2024-02-11 "perl v5.38.2" "Perl Programmers Reference Guide"
+.\" For nroff, turn off justification. Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH NAME
+Unicode::UCD \- Unicode character database
+.SH SYNOPSIS
+.IX Header "SYNOPSIS"
+.Vb 2
+\& use Unicode::UCD \*(Aqcharinfo\*(Aq;
+\& my $charinfo = charinfo($codepoint);
+\&
+\& use Unicode::UCD \*(Aqcharprop\*(Aq;
+\& my $value = charprop($codepoint, $property);
+\&
+\& use Unicode::UCD \*(Aqcharprops_all\*(Aq;
+\& my $all_values_hash_ref = charprops_all($codepoint);
+\&
+\& use Unicode::UCD \*(Aqcasefold\*(Aq;
+\& my $casefold = casefold($codepoint);
+\&
+\& use Unicode::UCD \*(Aqall_casefolds\*(Aq;
+\& my $all_casefolds_ref = all_casefolds();
+\&
+\& use Unicode::UCD \*(Aqcasespec\*(Aq;
+\& my $casespec = casespec($codepoint);
+\&
+\& use Unicode::UCD \*(Aqcharblock\*(Aq;
+\& my $charblock = charblock($codepoint);
+\&
+\& use Unicode::UCD \*(Aqcharscript\*(Aq;
+\& my $charscript = charscript($codepoint);
+\&
+\& use Unicode::UCD \*(Aqcharblocks\*(Aq;
+\& my $charblocks = charblocks();
+\&
+\& use Unicode::UCD \*(Aqcharscripts\*(Aq;
+\& my $charscripts = charscripts();
+\&
+\& use Unicode::UCD qw(charscript charinrange);
+\& my $range = charscript($script);
+\& print "looks like $script\en" if charinrange($range, $codepoint);
+\&
+\& use Unicode::UCD qw(general_categories bidi_types);
+\& my $categories = general_categories();
+\& my $types = bidi_types();
+\&
+\& use Unicode::UCD \*(Aqprop_aliases\*(Aq;
+\& my @space_names = prop_aliases("space");
+\&
+\& use Unicode::UCD \*(Aqprop_value_aliases\*(Aq;
+\& my @gc_punct_names = prop_value_aliases("Gc", "Punct");
+\&
+\& use Unicode::UCD \*(Aqprop_values\*(Aq;
+\& my @all_EA_short_names = prop_values("East_Asian_Width");
+\&
+\& use Unicode::UCD \*(Aqprop_invlist\*(Aq;
+\& my @puncts = prop_invlist("gc=punctuation");
+\&
+\& use Unicode::UCD \*(Aqprop_invmap\*(Aq;
+\& my ($list_ref, $map_ref, $format, $missing)
+\& = prop_invmap("General Category");
+\&
+\& use Unicode::UCD \*(Aqsearch_invlist\*(Aq;
+\& my $index = search_invlist(\e@invlist, $code_point);
+\&
+\& # The following function should be used only internally in
+\& # implementations of the Unicode Normalization Algorithm, and there
+\& # are better choices than it.
+\& use Unicode::UCD \*(Aqcompexcl\*(Aq;
+\& my $compexcl = compexcl($codepoint);
+\&
+\& use Unicode::UCD \*(Aqnamedseq\*(Aq;
+\& my $namedseq = namedseq($named_sequence_name);
+\&
+\& my $unicode_version = Unicode::UCD::UnicodeVersion();
+\&
+\& my $convert_to_numeric =
+\& Unicode::UCD::num("\eN{RUMI DIGIT ONE}\eN{RUMI DIGIT TWO}");
+.Ve
+.SH DESCRIPTION
+.IX Header "DESCRIPTION"
+The Unicode::UCD module offers a series of functions that
+provide a simple interface to the Unicode
+Character Database.
+.SS "code point argument"
+.IX Subsection "code point argument"
+Some of the functions are called with a \fIcode point argument\fR, which is either
+a decimal or a hexadecimal scalar designating a code point in the platform's
+native character set (extended to Unicode), or a string containing \f(CW\*(C`U+\*(C'\fR
+followed by hexadecimals
+designating a Unicode code point. A leading 0 will force a hexadecimal
+interpretation, as will a hexadecimal digit that isn't a decimal digit.
+.PP
+Examples:
+.PP
+.Vb 6
+\& 223 # Decimal 223 in native character set
+\& 0223 # Hexadecimal 223, native (= 547 decimal)
+\& 0xDF # Hexadecimal DF, native (= 223 decimal)
+\& \*(Aq0xDF\*(Aq # String form of hexadecimal (= 223 decimal)
+\& \*(AqU+DF\*(Aq # Hexadecimal DF, in Unicode\*(Aqs character set
+\& (= LATIN SMALL LETTER SHARP S)
+.Ve
+.PP
+Note that the largest code point in Unicode is U+10FFFF.
+.SS \fBcharinfo()\fP
+.IX Subsection "charinfo()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharinfo\*(Aq;
+\&
+\& my $charinfo = charinfo(0x41);
+.Ve
+.PP
+This returns information about the input "code point argument"
+as a reference to a hash of fields as defined by the Unicode
+standard. If the "code point argument" is not assigned in the standard
+(i.e., has the general category \f(CW\*(C`Cn\*(C'\fR meaning \f(CW\*(C`Unassigned\*(C'\fR)
+or is a non-character (meaning it is guaranteed to never be assigned in
+the standard),
+\&\f(CW\*(C`undef\*(C'\fR is returned.
+.PP
+Fields that aren't applicable to the particular code point argument exist in the
+returned hash, and are empty.
+.PP
+For results that are less "raw" than this function returns, or to get the values for
+any property, not just the few covered by this function, use the
+"\fBcharprop()\fR" function.
+.PP
+The keys in the hash with the meanings of their values are:
+.IP \fBcode\fR 4
+.IX Item "code"
+the input native "code point argument" expressed in hexadecimal, with
+leading zeros
+added if necessary to make it contain at least four hexdigits
+.IP \fBname\fR 4
+.IX Item "name"
+name of \fIcode\fR, all IN UPPER CASE.
+Some control-type code points do not have names.
+This field will be empty for \f(CW\*(C`Surrogate\*(C'\fR and \f(CW\*(C`Private Use\*(C'\fR code points,
+and for the others without a name,
+it will contain a description enclosed in angle brackets, like
+\&\f(CW\*(C`<control>\*(C'\fR.
+.IP \fBcategory\fR 4
+.IX Item "category"
+The short name of the general category of \fIcode\fR.
+This will match one of the keys in the hash returned by "\fBgeneral_categories()\fR".
+.Sp
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the category name.
+.IP \fBcombining\fR 4
+.IX Item "combining"
+the combining class number for \fIcode\fR used in the Canonical Ordering Algorithm.
+For Unicode 5.1, this is described in Section 3.11 \f(CW\*(C`Canonical Ordering Behavior\*(C'\fR
+available at
+<http://www.unicode.org/versions/Unicode5.1.0/>
+.Sp
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the combining class number.
+.IP \fBbidi\fR 4
+.IX Item "bidi"
+bidirectional type of \fIcode\fR.
+This will match one of the keys in the hash returned by "\fBbidi_types()\fR".
+.Sp
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the bidi type name.
+.IP \fBdecomposition\fR 4
+.IX Item "decomposition"
+is empty if \fIcode\fR has no decomposition; or is one or more codes
+(separated by spaces) that, taken in order, represent a decomposition for
+\&\fIcode\fR. Each has at least four hexdigits.
+The codes may be preceded by a word enclosed in angle brackets, then a space,
+like \f(CW\*(C`<compat> \*(C'\fR, giving the type of decomposition.
+.Sp
+This decomposition may be an intermediate one whose components are also
+decomposable. Use Unicode::Normalize to get the final decomposition in one
+step.
+.IP \fBdecimal\fR 4
+.IX Item "decimal"
+if \fIcode\fR represents a decimal digit this is its integer numeric value
+.IP \fBdigit\fR 4
+.IX Item "digit"
+if \fIcode\fR represents some other digit-like number, this is its integer
+numeric value
+.IP \fBnumeric\fR 4
+.IX Item "numeric"
+if \fIcode\fR represents a whole or rational number, this is its numeric value.
+Rational values are expressed as a string like \f(CW\*(C`1/4\*(C'\fR.
+.IP \fBmirrored\fR 4
+.IX Item "mirrored"
+\&\f(CW\*(C`Y\*(C'\fR or \f(CW\*(C`N\*(C'\fR designating if \fIcode\fR is mirrored in bidirectional text
+.IP \fBunicode10\fR 4
+.IX Item "unicode10"
+name of \fIcode\fR in the Unicode 1.0 standard if one
+existed for this code point and is different from the current name
+.IP \fBcomment\fR 4
+.IX Item "comment"
+As of Unicode 6.0, this is always empty.
+.IP \fBupper\fR 4
+.IX Item "upper"
+is, if non-empty, the uppercase mapping for \fIcode\fR expressed as at least four
+hexdigits. This indicates that the full uppercase mapping is a single
+character, and is identical to the simple (single-character only) mapping.
+When this field is empty, it means that the simple uppercase mapping is
+\&\fIcode\fR itself; you'll need some other means (like "\fBcharprop()\fR" or
+"\fBcasespec()\fR") to get the full mapping.
+.IP \fBlower\fR 4
+.IX Item "lower"
+is, if non-empty, the lowercase mapping for \fIcode\fR expressed as at least four
+hexdigits. This indicates that the full lowercase mapping is a single
+character, and is identical to the simple (single-character only) mapping.
+When this field is empty, it means that the simple lowercase mapping is
+\&\fIcode\fR itself; you'll need some other means (like "\fBcharprop()\fR" or
+"\fBcasespec()\fR") to get the full mapping.
+.IP \fBtitle\fR 4
+.IX Item "title"
+is, if non-empty, the titlecase mapping for \fIcode\fR expressed as at least four
+hexdigits. This indicates that the full titlecase mapping is a single
+character, and is identical to the simple (single-character only) mapping.
+When this field is empty, it means that the simple titlecase mapping is
+\&\fIcode\fR itself; you'll need some other means (like "\fBcharprop()\fR" or
+"\fBcasespec()\fR") to get the full mapping.
+.IP \fBblock\fR 4
+.IX Item "block"
+the block \fIcode\fR belongs to (used in \f(CW\*(C`\ep{Blk=...}\*(C'\fR).
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the block name.
+.Sp
+See "Blocks versus Scripts".
+.IP \fBscript\fR 4
+.IX Item "script"
+the script \fIcode\fR belongs to.
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the script name. Note that this is the older "Script" property value, and
+not the improved "Script_Extensions" value.
+.Sp
+See "Blocks versus Scripts".
+.PP
+Note that you cannot do (de)composition and casing based solely on the
+\&\fIdecomposition\fR, \fIcombining\fR, \fIlower\fR, \fIupper\fR, and \fItitle\fR fields; you
+will need also the "\fBcasespec()\fR" function and the \f(CW\*(C`Composition_Exclusion\*(C'\fR
+property. (Or you could just use the \fBlc()\fR,
+\&\fBuc()\fR, and \fBucfirst()\fR functions, and the
+Unicode::Normalize module.)
+.SS \fBcharprop()\fP
+.IX Subsection "charprop()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharprop\*(Aq;
+\&
+\& print charprop(0x41, "Gc"), "\en";
+\& print charprop(0x61, "General_Category"), "\en";
+\&
+\& prints
+\& Lu
+\& Ll
+.Ve
+.PP
+This returns the value of the Unicode property given by the second parameter
+for the "code point argument" given by the first.
+.PP
+The passed-in property may be specified as any of the synonyms returned by
+"\fBprop_aliases()\fR".
+.PP
+The return value is always a scalar, either a string or a number. For
+properties where there are synonyms for the values, the synonym returned by
+this function is the longest, most descriptive form, the one returned by
+"\fBprop_value_aliases()\fR" when called in a scalar context. Of course, you can
+call "\fBprop_value_aliases()\fR" on the result to get other synonyms.
+.PP
+The return values are more "cooked" than the "\fBcharinfo()\fR" ones. For
+example, the \f(CW"uc"\fR property value is the actual string containing the full
+uppercase mapping of the input code point. You have to go to extra trouble
+with \f(CW\*(C`charinfo\*(C'\fR to get this value from its \f(CW\*(C`upper\*(C'\fR hash element when the
+full mapping differs from the simple one.
+.PP
+Special note should be made of the return values for a few properties:
+.IP Block 4
+.IX Item "Block"
+The value returned is the new-style (see "Old-style versus new-style block
+names").
+.IP Decomposition_Mapping 4
+.IX Item "Decomposition_Mapping"
+Like "\fBcharinfo()\fR", the result may be an intermediate decomposition whose
+components are also decomposable. Use Unicode::Normalize to get the final
+decomposition in one step.
+.Sp
+Unlike "\fBcharinfo()\fR", this does not include the decomposition type. Use the
+\&\f(CW\*(C`Decomposition_Type\*(C'\fR property to get that.
+.IP Name_Alias 4
+.IX Item "Name_Alias"
+If the input code point's name has more than one synonym, they are returned
+joined into a single comma-separated string.
+.IP Numeric_Value 4
+.IX Item "Numeric_Value"
+If the result is a fraction, it is converted into a floating point number to
+the accuracy of your platform.
+.IP Script_Extensions 4
+.IX Item "Script_Extensions"
+If the result is multiple script names, they are returned joined into a single
+comma-separated string.
+.PP
+When called with a property that is a Perl extension that isn't expressible in
+a compound form, this function currently returns \f(CW\*(C`undef\*(C'\fR, as the only two
+possible values are \fItrue\fR or \fIfalse\fR (1 or 0 I suppose). This behavior may
+change in the future, so don't write code that relies on it. \f(CW\*(C`Present_In\*(C'\fR is
+a Perl extension that is expressible in a bipartite or compound form (for
+example, \f(CW\*(C`\ep{Present_In=4.0}\*(C'\fR), so \f(CW\*(C`charprop\*(C'\fR accepts it. But \f(CW\*(C`Any\*(C'\fR is a
+Perl extension that isn't expressible that way, so \f(CW\*(C`charprop\*(C'\fR returns
+\&\f(CW\*(C`undef\*(C'\fR for it. Also \f(CW\*(C`charprop\*(C'\fR returns \f(CW\*(C`undef\*(C'\fR for all Perl extensions
+that are internal-only.
+.SS \fBcharprops_all()\fP
+.IX Subsection "charprops_all()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharprops_all\*(Aq;
+\&
+\& my $properties_of_A_hash_ref = charprops_all("U+41");
+.Ve
+.PP
+This returns a reference to a hash whose keys are all the distinct Unicode (no
+Perl extension) properties, and whose values are the respective values for
+those properties for the input "code point argument".
+.PP
+Each key is the property name in its longest, most descriptive form. The
+values are what "\fBcharprop()\fR" would return.
+.PP
+This function is expensive in time and memory.
+.SS \fBcharblock()\fP
+.IX Subsection "charblock()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharblock\*(Aq;
+\&
+\& my $charblock = charblock(0x41);
+\& my $charblock = charblock(1234);
+\& my $charblock = charblock(0x263a);
+\& my $charblock = charblock("U+263a");
+\&
+\& my $range = charblock(\*(AqArmenian\*(Aq);
+.Ve
+.PP
+With a "code point argument" \f(CWcharblock()\fR returns the \fIblock\fR the code point
+belongs to, e.g. \f(CW\*(C`Basic Latin\*(C'\fR. The old-style block name is returned (see
+"Old-style versus new-style block names").
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the block name.
+.PP
+If the code point is unassigned, this returns the block it would belong to if
+it were assigned. (If the Unicode version being used is so early as to not
+have blocks, all code points are considered to be in \f(CW\*(C`No_Block\*(C'\fR.)
+.PP
+See also "Blocks versus Scripts".
+.PP
+If supplied with an argument that can't be a code point, \f(CWcharblock()\fR tries to
+do the opposite and interpret the argument as an old-style block name. On an
+ASCII platform, the return value is a \fIrange set\fR with one range: an
+anonymous array with a single element that consists of another anonymous array
+whose first element is the first code point in the block, and whose second
+element is the final code point in the block. On an EBCDIC
+platform, the first two Unicode blocks are not contiguous. Their range sets
+are lists containing \fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You
+can test whether a code point is in a range set using the "\fBcharinrange()\fR"
+function. (To be precise, each \fIrange set\fR contains a third array element,
+after the range boundary ones: the old-style block name.)
+.PP
+If the argument to \f(CWcharblock()\fR is not a known block, \f(CW\*(C`undef\*(C'\fR is
+returned.
+.SS \fBcharscript()\fP
+.IX Subsection "charscript()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharscript\*(Aq;
+\&
+\& my $charscript = charscript(0x41);
+\& my $charscript = charscript(1234);
+\& my $charscript = charscript("U+263a");
+\&
+\& my $range = charscript(\*(AqThai\*(Aq);
+.Ve
+.PP
+With a "code point argument", \f(CWcharscript()\fR returns the \fIscript\fR the
+code point belongs to, e.g., \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR.
+If the code point is unassigned or the Unicode version being used is so early
+that it doesn't have scripts, this function returns \f(CW"Unknown"\fR.
+The "\fBprop_value_aliases()\fR" function can be used to get all the synonyms
+of the script name.
+.PP
+Note that the Script_Extensions property is an improved version of the Script
+property, and you should probably be using that instead, with the
+"\fBcharprop()\fR" function.
+.PP
+If supplied with an argument that can't be a code point, \fBcharscript()\fR tries
+to do the opposite and interpret the argument as a script name. The
+return value is a \fIrange set\fR: an anonymous array of arrays that contain
+\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
+code point is in a range set using the "\fBcharinrange()\fR" function.
+(To be precise, each \fIrange set\fR contains a third array element,
+after the range boundary ones: the script name.)
+.PP
+If the \f(CWcharscript()\fR argument is not a known script, \f(CW\*(C`undef\*(C'\fR is returned.
+.PP
+See also "Blocks versus Scripts".
+.SS \fBcharblocks()\fP
+.IX Subsection "charblocks()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharblocks\*(Aq;
+\&
+\& my $charblocks = charblocks();
+.Ve
+.PP
+\&\f(CWcharblocks()\fR returns a reference to a hash with the known block names
+as the keys, and the code point ranges (see "\fBcharblock()\fR") as the values.
+.PP
+The names are in the old-style (see "Old-style versus new-style block
+names").
+.PP
+prop_invmap("block") can be used to get this same data in a
+different type of data structure.
+.PP
+prop_values("Block") can be used to get all
+the known new-style block names as a list, without the code point ranges.
+.PP
+See also "Blocks versus Scripts".
+.SS \fBcharscripts()\fP
+.IX Subsection "charscripts()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcharscripts\*(Aq;
+\&
+\& my $charscripts = charscripts();
+.Ve
+.PP
+\&\f(CWcharscripts()\fR returns a reference to a hash with the known script
+names as the keys, and the code point ranges (see "\fBcharscript()\fR") as
+the values.
+.PP
+prop_invmap("script") can be used to get this same data in a
+different type of data structure. Since the Script_Extensions property is an
+improved version of the Script property, you should instead use
+prop_invmap("scx").
+.PP
+\&\f(CWprop_values("Script")\fR can be used to get all
+the known script names as a list, without the code point ranges.
+.PP
+See also "Blocks versus Scripts".
+.SS \fBcharinrange()\fP
+.IX Subsection "charinrange()"
+In addition to using the \f(CW\*(C`\ep{Blk=...}\*(C'\fR and \f(CW\*(C`\eP{Blk=...}\*(C'\fR constructs, you
+can also test whether a code point is in the \fIrange\fR as returned by
+"\fBcharblock()\fR" and "\fBcharscript()\fR" or as the values of the hash returned
+by "\fBcharblocks()\fR" and "\fBcharscripts()\fR" by using \f(CWcharinrange()\fR:
+.PP
+.Vb 1
+\& use Unicode::UCD qw(charscript charinrange);
+\&
+\& $range = charscript(\*(AqHiragana\*(Aq);
+\& print "looks like hiragana\en" if charinrange($range, $codepoint);
+.Ve
+.SS \fBgeneral_categories()\fP
+.IX Subsection "general_categories()"
+.Vb 1
+\& use Unicode::UCD \*(Aqgeneral_categories\*(Aq;
+\&
+\& my $categories = general_categories();
+.Ve
+.PP
+This returns a reference to a hash which has short
+general category names (such as \f(CW\*(C`Lu\*(C'\fR, \f(CW\*(C`Nd\*(C'\fR, \f(CW\*(C`Zs\*(C'\fR, \f(CW\*(C`S\*(C'\fR) as keys and long
+names (such as \f(CW\*(C`UppercaseLetter\*(C'\fR, \f(CW\*(C`DecimalNumber\*(C'\fR, \f(CW\*(C`SpaceSeparator\*(C'\fR,
+\&\f(CW\*(C`Symbol\*(C'\fR) as values. The hash is reversible in case you need to go
+from the long names to the short names. The general category is the
+one returned from
+"\fBcharinfo()\fR" under the \f(CW\*(C`category\*(C'\fR key.
+.PP
+The "\fBprop_values()\fR" and "\fBprop_value_aliases()\fR" functions can be used as an
+alternative to this function; the first returning a simple list of the short
+category names; and the second gets all the synonyms of a given category name.
+.SS \fBbidi_types()\fP
+.IX Subsection "bidi_types()"
+.Vb 1
+\& use Unicode::UCD \*(Aqbidi_types\*(Aq;
+\&
+\& my $types = bidi_types();
+.Ve
+.PP
+This returns a reference to a hash which has the short
+bidi (bidirectional) type names (such as \f(CW\*(C`L\*(C'\fR, \f(CW\*(C`R\*(C'\fR) as keys and long
+names (such as \f(CW\*(C`Left\-to\-Right\*(C'\fR, \f(CW\*(C`Right\-to\-Left\*(C'\fR) as values. The
+hash is reversible in case you need to go from the long names to the
+short names. The bidi type is the one returned from
+"\fBcharinfo()\fR"
+under the \f(CW\*(C`bidi\*(C'\fR key. For the exact meaning of the various bidi classes
+the Unicode TR9 is recommended reading:
+<http://www.unicode.org/reports/tr9/>
+(as of Unicode 5.0.0)
+.PP
+The "\fBprop_values()\fR" and "\fBprop_value_aliases()\fR" functions can be used as an
+alternative to this function; the first returning a simple list of the short
+bidi type names; and the second gets all the synonyms of a given bidi type
+name.
+.SS \fBcompexcl()\fP
+.IX Subsection "compexcl()"
+WARNING: Unicode discourages the use of this function or any of the
+alternative mechanisms listed in this section (the documentation of
+\&\f(CWcompexcl()\fR), except internally in implementations of the Unicode
+Normalization Algorithm. You should be using Unicode::Normalize directly
+instead of these. Using these will likely lead to half-baked results.
+.PP
+.Vb 1
+\& use Unicode::UCD \*(Aqcompexcl\*(Aq;
+\&
+\& my $compexcl = compexcl(0x09dc);
+.Ve
+.PP
+This routine returns \f(CW\*(C`undef\*(C'\fR if the Unicode version being used is so early
+that it doesn't have this property.
+.PP
+\&\f(CWcompexcl()\fR is included for backwards
+compatibility, but as of Perl 5.12 and more modern Unicode versions, for
+most purposes it is probably more convenient to use one of the following
+instead:
+.PP
+.Vb 2
+\& my $compexcl = chr(0x09dc) =~ /\ep{Comp_Ex}/;
+\& my $compexcl = chr(0x09dc) =~ /\ep{Full_Composition_Exclusion}/;
+.Ve
+.PP
+or even
+.PP
+.Vb 2
+\& my $compexcl = chr(0x09dc) =~ /\ep{CE}/;
+\& my $compexcl = chr(0x09dc) =~ /\ep{Composition_Exclusion}/;
+.Ve
+.PP
+The first two forms return \fBtrue\fR if the "code point argument" should not
+be produced by composition normalization. For the final two forms to return
+\&\fBtrue\fR, it is additionally required that this fact not otherwise be
+determinable from the Unicode data base.
+.PP
+This routine behaves identically to the final two forms. That is,
+it does not return \fBtrue\fR if the code point has a decomposition
+consisting of another single code point, nor if its decomposition starts
+with a code point whose combining class is non-zero. Code points that meet
+either of these conditions should also not be produced by composition
+normalization, which is probably why you should use the
+\&\f(CW\*(C`Full_Composition_Exclusion\*(C'\fR property instead, as shown above.
+.PP
+The routine returns \fBfalse\fR otherwise.
+.SS \fBcasefold()\fP
+.IX Subsection "casefold()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcasefold\*(Aq;
+\&
+\& my $casefold = casefold(0xDF);
+\& if (defined $casefold) {
+\& my @full_fold_hex = split / /, $casefold\->{\*(Aqfull\*(Aq};
+\& my $full_fold_string =
+\& join "", map {chr(hex($_))} @full_fold_hex;
+\& my @turkic_fold_hex =
+\& split / /, ($casefold\->{\*(Aqturkic\*(Aq} ne "")
+\& ? $casefold\->{\*(Aqturkic\*(Aq}
+\& : $casefold\->{\*(Aqfull\*(Aq};
+\& my $turkic_fold_string =
+\& join "", map {chr(hex($_))} @turkic_fold_hex;
+\& }
+\& if (defined $casefold && $casefold\->{\*(Aqsimple\*(Aq} ne "") {
+\& my $simple_fold_hex = $casefold\->{\*(Aqsimple\*(Aq};
+\& my $simple_fold_string = chr(hex($simple_fold_hex));
+\& }
+.Ve
+.PP
+This returns the (almost) locale-independent case folding of the
+character specified by the "code point argument". (Starting in Perl v5.16,
+the core function \f(CWfc()\fR returns the \f(CW\*(C`full\*(C'\fR mapping (described below)
+faster than this does, and for entire strings.)
+.PP
+If there is no case folding for the input code point, \f(CW\*(C`undef\*(C'\fR is returned.
+.PP
+If there is a case folding for that code point, a reference to a hash
+with the following fields is returned:
+.IP \fBcode\fR 4
+.IX Item "code"
+the input native "code point argument" expressed in hexadecimal, with
+leading zeros
+added if necessary to make it contain at least four hexdigits
+.IP \fBfull\fR 4
+.IX Item "full"
+one or more codes (separated by spaces) that, taken in order, give the
+code points for the case folding for \fIcode\fR.
+Each has at least four hexdigits.
+.IP \fBsimple\fR 4
+.IX Item "simple"
+is empty, or is exactly one code with at least four hexdigits which can be used
+as an alternative case folding when the calling program cannot cope with the
+fold being a sequence of multiple code points. If \fIfull\fR is just one code
+point, then \fIsimple\fR equals \fIfull\fR. If there is no single code point folding
+defined for \fIcode\fR, then \fIsimple\fR is the empty string. Otherwise, it is an
+inferior, but still better-than-nothing alternative folding to \fIfull\fR.
+.IP \fBmapping\fR 4
+.IX Item "mapping"
+is the same as \fIsimple\fR if \fIsimple\fR is not empty, and it is the same as \fIfull\fR
+otherwise. It can be considered to be the simplest possible folding for
+\&\fIcode\fR. It is defined primarily for backwards compatibility.
+.IP \fBstatus\fR 4
+.IX Item "status"
+is \f(CW\*(C`C\*(C'\fR (for \f(CW\*(C`common\*(C'\fR) if the best possible fold is a single code point
+(\fIsimple\fR equals \fIfull\fR equals \fImapping\fR). It is \f(CW\*(C`S\*(C'\fR if there are distinct
+folds, \fIsimple\fR and \fIfull\fR (\fImapping\fR equals \fIsimple\fR). And it is \f(CW\*(C`F\*(C'\fR if
+there is only a \fIfull\fR fold (\fImapping\fR equals \fIfull\fR; \fIsimple\fR is empty).
+Note that this
+describes the contents of \fImapping\fR. It is defined primarily for backwards
+compatibility.
+.Sp
+For Unicode versions between 3.1 and 3.1.1 inclusive, \fIstatus\fR can also be
+\&\f(CW\*(C`I\*(C'\fR which is the same as \f(CW\*(C`C\*(C'\fR but is a special case for dotted uppercase I and
+dotless lowercase i:
+.RS 4
+.ie n .IP "\fB*\fR If you use this ""I"" mapping" 4
+.el .IP "\fB*\fR If you use this \f(CWI\fR mapping" 4
+.IX Item "* If you use this I mapping"
+the result is case-insensitive,
+but dotless and dotted I's are not distinguished
+.ie n .IP "\fB*\fR If you exclude this ""I"" mapping" 4
+.el .IP "\fB*\fR If you exclude this \f(CWI\fR mapping" 4
+.IX Item "* If you exclude this I mapping"
+the result is not fully case-insensitive, but
+dotless and dotted I's are distinguished
+.RE
+.RS 4
+.RE
+.IP \fBturkic\fR 4
+.IX Item "turkic"
+contains any special folding for Turkic languages. For versions of Unicode
+starting with 3.2, this field is empty unless \fIcode\fR has a different folding
+in Turkic languages, in which case it is one or more codes (separated by
+spaces) that, taken in order, give the code points for the case folding for
+\&\fIcode\fR in those languages.
+Each code has at least four hexdigits.
+Note that this folding does not maintain canonical equivalence without
+additional processing.
+.Sp
+For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
+there is a
+special folding for Turkic languages, in which case \fIstatus\fR is \f(CW\*(C`I\*(C'\fR, and
+\&\fImapping\fR, \fIfull\fR, \fIsimple\fR, and \fIturkic\fR are all equal.
+.PP
+Programs that want complete generality and the best folding results should use
+the folding contained in the \fIfull\fR field. But note that the fold for some
+code points will be a sequence of multiple code points.
+.PP
+Programs that can't cope with the fold mapping being multiple code points can
+use the folding contained in the \fIsimple\fR field, with the loss of some
+generality. In Unicode 5.1, about 7% of the defined foldings have no single
+code point folding.
+.PP
+The \fImapping\fR and \fIstatus\fR fields are provided for backwards compatibility for
+existing programs. They contain the same values as in previous versions of
+this function.
+.PP
+The folding is not completely independent of locale. The \fIturkic\fR field contains results to
+use when the locale is a Turkic language.
+.PP
+For more information about case mappings see
+<http://www.unicode.org/reports/tr21>
+.SS \fBall_casefolds()\fP
+.IX Subsection "all_casefolds()"
+.Vb 1
+\& use Unicode::UCD \*(Aqall_casefolds\*(Aq;
+\&
+\& my $all_folds_ref = all_casefolds();
+\& foreach my $char_with_casefold (sort { $a <=> $b }
+\& keys %$all_folds_ref)
+\& {
+\& printf "%04X:", $char_with_casefold;
+\& my $casefold = $all_folds_ref\->{$char_with_casefold};
+\&
+\& # Get folds for $char_with_casefold
+\&
+\& my @full_fold_hex = split / /, $casefold\->{\*(Aqfull\*(Aq};
+\& my $full_fold_string =
+\& join "", map {chr(hex($_))} @full_fold_hex;
+\& print " full=", join " ", @full_fold_hex;
+\& my @turkic_fold_hex =
+\& split / /, ($casefold\->{\*(Aqturkic\*(Aq} ne "")
+\& ? $casefold\->{\*(Aqturkic\*(Aq}
+\& : $casefold\->{\*(Aqfull\*(Aq};
+\& my $turkic_fold_string =
+\& join "", map {chr(hex($_))} @turkic_fold_hex;
+\& print "; turkic=", join " ", @turkic_fold_hex;
+\& if (defined $casefold && $casefold\->{\*(Aqsimple\*(Aq} ne "") {
+\& my $simple_fold_hex = $casefold\->{\*(Aqsimple\*(Aq};
+\& my $simple_fold_string = chr(hex($simple_fold_hex));
+\& print "; simple=$simple_fold_hex";
+\& }
+\& print "\en";
+\& }
+.Ve
+.PP
+This returns all the case foldings in the current version of Unicode in the
+form of a reference to a hash. Each key to the hash is the decimal
+representation of a Unicode character that has a casefold to other than
+itself. The casefold of a semi-colon is itself, so it isn't in the hash;
+likewise for a lowercase "a", but there is an entry for a capital "A". The
+hash value for each key is another hash, identical to what is returned by
+"\fBcasefold()\fR" if called with that code point as its argument. So the value
+\&\f(CW\*(C`all_casefolds()\->{ord("A")}\*(C'\fR is equivalent to \f(CW\*(C`casefold(ord("A"))\*(C'\fR.
+.SS \fBcasespec()\fP
+.IX Subsection "casespec()"
+.Vb 1
+\& use Unicode::UCD \*(Aqcasespec\*(Aq;
+\&
+\& my $casespec = casespec(0xFB00);
+.Ve
+.PP
+This returns the potentially locale-dependent case mappings of the "code point
+argument". The mappings may be longer than a single code point (which the basic
+Unicode case mappings as returned by "\fBcharinfo()\fR" never are).
+.PP
+If there are no case mappings for the "code point argument", or if all three
+possible mappings (\fIlower\fR, \fItitle\fR and \fIupper\fR) result in single code
+points and are locale independent and unconditional, \f(CW\*(C`undef\*(C'\fR is returned
+(which means that the case mappings, if any, for the code point are those
+returned by "\fBcharinfo()\fR").
+.PP
+Otherwise, a reference to a hash giving the mappings (or a reference to a hash
+of such hashes, explained below) is returned with the following keys and their
+meanings:
+.PP
+The keys in the bottom layer hash with the meanings of their values are:
+.IP \fBcode\fR 4
+.IX Item "code"
+the input native "code point argument" expressed in hexadecimal, with
+leading zeros
+added if necessary to make it contain at least four hexdigits
+.IP \fBlower\fR 4
+.IX Item "lower"
+one or more codes (separated by spaces) that, taken in order, give the
+code points for the lower case of \fIcode\fR.
+Each has at least four hexdigits.
+.IP \fBtitle\fR 4
+.IX Item "title"
+one or more codes (separated by spaces) that, taken in order, give the
+code points for the title case of \fIcode\fR.
+Each has at least four hexdigits.
+.IP \fBupper\fR 4
+.IX Item "upper"
+one or more codes (separated by spaces) that, taken in order, give the
+code points for the upper case of \fIcode\fR.
+Each has at least four hexdigits.
+.IP \fBcondition\fR 4
+.IX Item "condition"
+the conditions for the mappings to be valid.
+If \f(CW\*(C`undef\*(C'\fR, the mappings are always valid.
+When defined, this field is a list of conditions,
+all of which must be true for the mappings to be valid.
+The list consists of one or more
+\&\fIlocales\fR (see below)
+and/or \fIcontexts\fR (explained in the next paragraph),
+separated by spaces.
+(Other than as used to separate elements, spaces are to be ignored.)
+Case distinctions in the condition list are not significant.
+Conditions preceded by "NON_" represent the negation of the condition.
+.Sp
+A \fIcontext\fR is one of those defined in the Unicode standard.
+For Unicode 5.1, they are defined in Section 3.13 \f(CW\*(C`Default Case Operations\*(C'\fR
+available at
+<http://www.unicode.org/versions/Unicode5.1.0/>.
+These are for context-sensitive casing.
+.PP
+The hash described above is returned for locale-independent casing, where
+at least one of the mappings has length longer than one. If \f(CW\*(C`undef\*(C'\fR is
+returned, the code point may have mappings, but if so, all are length one,
+and are returned by "\fBcharinfo()\fR".
+Note that when this function does return a value, it will be for the complete
+set of mappings for a code point, even those whose length is one.
+.PP
+If there are additional casing rules that apply only in certain locales,
+an additional key for each will be defined in the returned hash. Each such key
+will be its locale name, defined as a 2\-letter ISO 639 language code, possibly
+followed by a "_" and a 2\-letter ISO 3166 country code (possibly followed by a "_"
+and a variant code). To find the lists of all possible locales, see
+Locale::Country and Locale::Language.
+(In Unicode 6.0, the only locales returned by this function
+are \f(CW\*(C`lt\*(C'\fR, \f(CW\*(C`tr\*(C'\fR, and \f(CW\*(C`az\*(C'\fR.)
+.PP
+Each locale key is a reference to a hash that has the form above, and gives
+the casing rules for that particular locale, which take precedence over the
+locale-independent ones when in that locale.
+.PP
+If the only casing for a code point is locale-dependent, then the returned
+hash will not have any of the base keys, like \f(CW\*(C`code\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, etc., but
+will contain only locale keys.
+.PP
+For more information about case mappings see
+<http://www.unicode.org/reports/tr21/>
+.SS \fBnamedseq()\fP
+.IX Subsection "namedseq()"
+.Vb 1
+\& use Unicode::UCD \*(Aqnamedseq\*(Aq;
+\&
+\& my $namedseq = namedseq("KATAKANA LETTER AINU P");
+\& my @namedseq = namedseq("KATAKANA LETTER AINU P");
+\& my %namedseq = namedseq();
+.Ve
+.PP
+If used with a single argument in a scalar context, returns the string
+consisting of the code points of the named sequence, or \f(CW\*(C`undef\*(C'\fR if no
+named sequence by that name exists. If used with a single argument in
+a list context, it returns the list of the ordinals of the code points.
+.PP
+If used with no
+arguments in a list context, it returns a hash with the names of all the
+named sequences as the keys and their sequences as strings as
+the values. Otherwise, it returns \f(CW\*(C`undef\*(C'\fR or an empty list depending
+on the context.
+.PP
+This function only operates on officially approved (not provisional) named
+sequences.
+.PP
+Note that as of Perl 5.14, \f(CW\*(C`\eN{KATAKANA LETTER AINU P}\*(C'\fR will insert the named
+sequence into double-quoted strings, and \f(CW\*(C`charnames::string_vianame("KATAKANA
+LETTER AINU P")\*(C'\fR will return the same string this function does, but will also
+operate on character names that aren't named sequences, without you having to
+know which are which. See charnames.
+.SS \fBnum()\fP
+.IX Subsection "num()"
+.Vb 1
+\& use Unicode::UCD \*(Aqnum\*(Aq;
+\&
+\& my $val = num("123");
+\& my $one_quarter = num("\eN{VULGAR FRACTION ONE QUARTER}");
+\& my $val = num("12a", \e$valid_length); # $valid_length contains 2
+.Ve
+.PP
+\&\f(CWnum()\fR returns the numeric value of the input Unicode string; or \f(CW\*(C`undef\*(C'\fR if it
+doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, \f(CWnum()\fR
+will set the scalar to the length of any valid initial substring; or to 0 if none.
+.PP
+If the string is just one character in length, the Unicode numeric value
+is returned if it has one, or \f(CW\*(C`undef\*(C'\fR otherwise. If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+\&\f(CW\*(C`undef\*(C'\fR. Note that the numeric value returned need not be a whole number.
+\&\f(CW\*(C`num("\eN{TIBETAN DIGIT HALF ZERO}")\*(C'\fR, for example, returns \-0.5.
+.PP
+If the string is more than one character, \f(CW\*(C`undef\*(C'\fR is returned unless
+all its characters are decimal digits (that is, they would match \f(CW\*(C`\ed+\*(C'\fR),
+from the same script. For example if you have an ASCII '0' and a Bengali
+\&'3', mixed together, they aren't considered a valid number, and \f(CW\*(C`undef\*(C'\fR
+is returned. A further restriction is that the digits all have to be of
+the same form. A half-width digit mixed with a full-width one will
+return \f(CW\*(C`undef\*(C'\fR. The Arabic script has two sets of digits; \f(CW\*(C`num\*(C'\fR will
+return \f(CW\*(C`undef\*(C'\fR unless all the digits in the string come from the same
+set. In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not \f(CW\*(C`undef\*(C'\fR.
+.PP
+\&\f(CW\*(C`num\*(C'\fR errs on the side of safety, and there may be valid strings of
+decimal digits that it doesn't recognize. Note that Unicode defines
+a number of "digit" characters that aren't "decimal digit" characters.
+"Decimal digits" have the property that they have a positional value, i.e.,
+there is a units position, a 10's position, a 100's, etc, AND they are
+arranged in Unicode in blocks of 10 contiguous code points. The Chinese
+digits, for example, are not in such a contiguous block, and so Unicode
+doesn't view them as decimal digits, but merely digits, and so \f(CW\*(C`\ed\*(C'\fR will not
+match them. A single-character string containing one of these digits will
+have its decimal value returned by \f(CW\*(C`num\*(C'\fR, but any longer string containing
+only these digits will return \f(CW\*(C`undef\*(C'\fR.
+.PP
+Strings of multiple sub\- and superscripts are not recognized as numbers. You
+can use either of the compatibility decompositions in Unicode::Normalize to
+change these into digits, and then call \f(CW\*(C`num\*(C'\fR on the result.
+.SS \fBprop_aliases()\fP
+.IX Subsection "prop_aliases()"
+.Vb 1
+\& use Unicode::UCD \*(Aqprop_aliases\*(Aq;
+\&
+\& my ($short_name, $full_name, @other_names) = prop_aliases("space");
+\& my $same_full_name = prop_aliases("Space"); # Scalar context
+\& my ($same_short_name) = prop_aliases("Space"); # gets 0th element
+\& print "The full name is $full_name\en";
+\& print "The short name is $short_name\en";
+\& print "The other aliases are: ", join(", ", @other_names), "\en";
+\&
+\& prints:
+\& The full name is White_Space
+\& The short name is WSpace
+\& The other aliases are: Space
+.Ve
+.PP
+Most Unicode properties have several synonymous names. Typically, there is at
+least a short name, convenient to type, and a long name that more fully
+describes the property, and hence is more easily understood.
+.PP
+If you know one name for a Unicode property, you can use \f(CW\*(C`prop_aliases\*(C'\fR to find
+either the long name (when called in scalar context), or a list of all of the
+names, somewhat ordered so that the short name is in the 0th element, the long
+name in the next element, and any other synonyms are in the remaining
+elements, in no particular order.
+.PP
+The long name is returned in a form nicely capitalized, suitable for printing.
+.PP
+The input parameter name is loosely matched, which means that white space,
+hyphens, and underscores are ignored (except for the trailing underscore in
+the old-form grandfathered-in \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR, and
+both of which mean \f(CW\*(C`General_Category=Cased Letter\*(C'\fR).
+.PP
+If the name is unknown, \f(CW\*(C`undef\*(C'\fR is returned (or an empty list in list
+context). Note that Perl typically recognizes property names in regular
+expressions with an optional \f(CW"Is_"\fR (with or without the underscore)
+prefixed to them, such as \f(CW\*(C`\ep{isgc=punct}\*(C'\fR. This function does not recognize
+those in the input, returning \f(CW\*(C`undef\*(C'\fR. Nor are they included in the output
+as possible synonyms.
+.PP
+\&\f(CW\*(C`prop_aliases\*(C'\fR does know about the Perl extensions to Unicode properties,
+such as \f(CW\*(C`Any\*(C'\fR and \f(CW\*(C`XPosixAlpha\*(C'\fR, and the single form equivalents to Unicode
+properties such as \f(CW\*(C`XDigit\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`In_Greek\*(C'\fR, and \f(CW\*(C`Is_Greek\*(C'\fR. The
+final example demonstrates that the \f(CW"Is_"\fR prefix is recognized for these
+extensions; it is needed to resolve ambiguities. For example,
+\&\f(CWprop_aliases(\*(Aqlc\*(Aq)\fR returns the list \f(CW\*(C`(lc, Lowercase_Mapping)\*(C'\fR, but
+\&\f(CWprop_aliases(\*(Aqislc\*(Aq)\fR returns \f(CW\*(C`(Is_LC, Cased_Letter)\*(C'\fR. This is
+because \f(CW\*(C`islc\*(C'\fR is a Perl extension which is short for
+\&\f(CW\*(C`General_Category=Cased Letter\*(C'\fR. The lists returned for the Perl extensions
+will not include the \f(CW"Is_"\fR prefix (whether or not the input had it) unless
+needed to resolve ambiguities, as shown in the \f(CW"islc"\fR example, where the
+returned list had one element containing \f(CW"Is_"\fR, and the other without.
+.PP
+It is also possible for the reverse to happen: \f(CWprop_aliases(\*(Aqisc\*(Aq)\fR returns
+the list \f(CW\*(C`(isc, ISO_Comment)\*(C'\fR; whereas \f(CWprop_aliases(\*(Aqc\*(Aq)\fR returns
+\&\f(CW\*(C`(C, Other)\*(C'\fR (the latter being a Perl extension meaning
+\&\f(CW\*(C`General_Category=Other\*(C'\fR).
+"Properties accessible through Unicode::UCD" in perluniprops lists the available
+forms, including which ones are discouraged from use.
+.PP
+Those discouraged forms are accepted as input to \f(CW\*(C`prop_aliases\*(C'\fR, but are not
+returned in the lists. \f(CWprop_aliases(\*(AqisL&\*(Aq)\fR and \f(CWprop_aliases(\*(AqisL_\*(Aq)\fR,
+which are old synonyms for \f(CW"Is_LC"\fR and should not be used in new code, are
+examples of this. These both return \f(CW\*(C`(Is_LC, Cased_Letter)\*(C'\fR. Thus this
+function allows you to take a discouraged form, and find its acceptable
+alternatives. The same goes with single-form Block property equivalences.
+Only the forms that begin with \f(CW"In_"\fR are not discouraged; if you pass
+\&\f(CW\*(C`prop_aliases\*(C'\fR a discouraged form, you will get back the equivalent ones that
+begin with \f(CW"In_"\fR. It will otherwise look like a new-style block name (see
+"Old-style versus new-style block names").
+.PP
+\&\f(CW\*(C`prop_aliases\*(C'\fR does not know about any user-defined properties, and will
+return \f(CW\*(C`undef\*(C'\fR if called with one of those. Likewise for Perl internal
+properties, with the exception of "Perl_Decimal_Digit" which it does know
+about (and which is documented below in "\fBprop_invmap()\fR").
+.SS \fBprop_values()\fP
+.IX Subsection "prop_values()"
+.Vb 1
+\& use Unicode::UCD \*(Aqprop_values\*(Aq;
+\&
+\& print "AHex values are: ", join(", ", prop_values("AHex")),
+\& "\en";
+\& prints:
+\& AHex values are: N, Y
+.Ve
+.PP
+Some Unicode properties have a restricted set of legal values. For example,
+all binary properties are restricted to just \f(CW\*(C`true\*(C'\fR or \f(CW\*(C`false\*(C'\fR; and there
+are only a few dozen possible General Categories. Use \f(CW\*(C`prop_values\*(C'\fR
+to find out if a given property is one such, and if so, to get a list of the
+values:
+.PP
+.Vb 3
+\& print join ", ", prop_values("NFC_Quick_Check");
+\& prints:
+\& M, N, Y
+.Ve
+.PP
+If the property doesn't have such a restricted set, \f(CW\*(C`undef\*(C'\fR is returned.
+.PP
+There are usually several synonyms for each possible value. Use
+"\fBprop_value_aliases()\fR" to access those.
+.PP
+Case, white space, hyphens, and underscores are ignored in the input property
+name (except for the trailing underscore in the old-form grandfathered-in
+general category property value \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR).
+.PP
+If the property name is unknown, \f(CW\*(C`undef\*(C'\fR is returned. Note that Perl typically
+recognizes property names in regular expressions with an optional \f(CW"Is_"\fR
+(with or without the underscore) prefixed to them, such as \f(CW\*(C`\ep{isgc=punct}\*(C'\fR.
+This function does not recognize those in the property parameter, returning
+\&\f(CW\*(C`undef\*(C'\fR.
+.PP
+For the block property, new-style block names are returned (see
+"Old-style versus new-style block names").
+.PP
+\&\f(CW\*(C`prop_values\*(C'\fR does not know about any user-defined properties, and
+will return \f(CW\*(C`undef\*(C'\fR if called with one of those.
+.SS \fBprop_value_aliases()\fP
+.IX Subsection "prop_value_aliases()"
+.Vb 1
+\& use Unicode::UCD \*(Aqprop_value_aliases\*(Aq;
+\&
+\& my ($short_name, $full_name, @other_names)
+\& = prop_value_aliases("Gc", "Punct");
+\& my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt
+\& my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
+\& # element
+\& print "The full name is $full_name\en";
+\& print "The short name is $short_name\en";
+\& print "The other aliases are: ", join(", ", @other_names), "\en";
+\&
+\& prints:
+\& The full name is Punctuation
+\& The short name is P
+\& The other aliases are: Punct
+.Ve
+.PP
+Some Unicode properties have a restricted set of legal values. For example,
+all binary properties are restricted to just \f(CW\*(C`true\*(C'\fR or \f(CW\*(C`false\*(C'\fR; and there
+are only a few dozen possible General Categories.
+.PP
+You can use "\fBprop_values()\fR" to find out if a given property is one which has
+a restricted set of values, and if so, what those values are. But usually
+each value actually has several synonyms. For example, in Unicode binary
+properties, \fItruth\fR can be represented by any of the strings "Y", "Yes", "T",
+or "True"; and the General Category "Punctuation" by that string, or "Punct",
+or simply "P".
+.PP
+Like property names, there is typically at least a short name for each such
+property-value, and a long name. If you know any name of the property-value
+(which you can get by "\fBprop_values()\fR"), you can use \f(CW\*(C`prop_value_aliases\*(C'\fR()
+to get the long name (when called in scalar context), or a list of all the
+names, with the short name in the 0th element, the long name in the next
+element, and any other synonyms in the remaining elements, in no particular
+order, except that any all-numeric synonyms will be last.
+.PP
+The long name is returned in a form nicely capitalized, suitable for printing.
+.PP
+Case, white space, hyphens, and underscores are ignored in the input parameters
+(except for the trailing underscore in the old-form grandfathered-in general
+category property value \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR).
+.PP
+If either name is unknown, \f(CW\*(C`undef\*(C'\fR is returned. Note that Perl typically
+recognizes property names in regular expressions with an optional \f(CW"Is_"\fR
+(with or without the underscore) prefixed to them, such as \f(CW\*(C`\ep{isgc=punct}\*(C'\fR.
+This function does not recognize those in the property parameter, returning
+\&\f(CW\*(C`undef\*(C'\fR.
+.PP
+If called with a property that doesn't have synonyms for its values, it
+returns the input value, possibly normalized with capitalization and
+underscores, but not necessarily checking that the input value is valid.
+.PP
+For the block property, new-style block names are returned (see
+"Old-style versus new-style block names").
+.PP
+To find the synonyms for single-forms, such as \f(CW\*(C`\ep{Any}\*(C'\fR, use
+"\fBprop_aliases()\fR" instead.
+.PP
+\&\f(CW\*(C`prop_value_aliases\*(C'\fR does not know about any user-defined properties, and
+will return \f(CW\*(C`undef\*(C'\fR if called with one of those.
+.SS \fBprop_invlist()\fP
+.IX Subsection "prop_invlist()"
+\&\f(CW\*(C`prop_invlist\*(C'\fR returns an inversion list (described below) that defines all the
+code points for the binary Unicode property (or "property=value" pair) given
+by the input parameter string:
+.PP
+.Vb 3
+\& use feature \*(Aqsay\*(Aq;
+\& use Unicode::UCD \*(Aqprop_invlist\*(Aq;
+\& say join ", ", prop_invlist("Any");
+\&
+\& prints:
+\& 0, 1114112
+.Ve
+.PP
+If the input is unknown, \f(CW\*(C`undef\*(C'\fR is returned in scalar context; an empty list
+in list context. If the input is known, the number of elements in
+the list is returned if called in scalar context.
+.PP
+perluniprops gives
+the list of properties that this function accepts, as well as all the possible
+forms for them (including with the optional "Is_" prefixes). (Except this
+function doesn't accept any Perl-internal properties, some of which are listed
+there.) This function uses the same loose or tighter matching rules for
+resolving the input property's name as is done for regular expressions. These
+are also specified in perluniprops. Examples of using the "property=value" form are:
+.PP
+.Vb 1
+\& say join ", ", prop_invlist("Script_Extensions=Shavian");
+\&
+\& prints:
+\& 66640, 66688
+\&
+\& say join ", ", prop_invlist("ASCII_Hex_Digit=No");
+\&
+\& prints:
+\& 0, 48, 58, 65, 71, 97, 103
+\&
+\& say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
+\&
+\& prints:
+\& 48, 58, 65, 71, 97, 103
+.Ve
+.PP
+Inversion lists are a compact way of specifying Unicode property-value
+definitions. The 0th item in the list is the lowest code point that has the
+property-value. The next item (item [1]) is the lowest code point beyond that
+one that does NOT have the property-value. And the next item beyond that
+([2]) is the lowest code point beyond that one that does have the
+property-value, and so on. Put another way, each element in the list gives
+the beginning of a range that has the property-value (for even numbered
+elements), or doesn't have the property-value (for odd numbered elements).
+The name for this data structure stems from the fact that each element in the
+list toggles (or inverts) whether the corresponding range is or isn't on the
+list.
+.PP
+In the final example above, the first ASCII Hex digit is code point 48, the
+character "0", and all code points from it through 57 (a "9") are ASCII hex
+digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
+are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points
+that aren't ASCII hex digits. That range extends to infinity, which on your
+computer can be found in the variable \f(CW$Unicode::UCD::MAX_CP\fR. (This
+variable is as close to infinity as Perl can get on your platform, and may be
+too high for some operations to work; you may wish to use a smaller number for
+your purposes.)
+.PP
+Note that the inversion lists returned by this function can possibly include
+non-Unicode code points, that is anything above 0x10FFFF. Unicode properties
+are not defined on such code points. You might wish to change the output to
+not include these. Simply add 0x110000 at the end of the non-empty returned
+list if it isn't already that value; and pop that value if it is; like:
+.PP
+.Vb 9
+\& my @list = prop_invlist("foo");
+\& if (@list) {
+\& if ($list[\-1] == 0x110000) {
+\& pop @list; # Defeat the turning on for above Unicode
+\& }
+\& else {
+\& push @list, 0x110000; # Turn off for above Unicode
+\& }
+\& }
+.Ve
+.PP
+It is a simple matter to expand out an inversion list to a full list of all
+code points that have the property-value:
+.PP
+.Vb 11
+\& my @invlist = prop_invlist($property_name);
+\& die "empty" unless @invlist;
+\& my @full_list;
+\& for (my $i = 0; $i < @invlist; $i += 2) {
+\& my $upper = ($i + 1) < @invlist
+\& ? $invlist[$i+1] \- 1 # In range
+\& : $Unicode::UCD::MAX_CP; # To infinity.
+\& for my $j ($invlist[$i] .. $upper) {
+\& push @full_list, $j;
+\& }
+\& }
+.Ve
+.PP
+\&\f(CW\*(C`prop_invlist\*(C'\fR does not know about any user-defined or Perl internal-only
+properties, and will return \f(CW\*(C`undef\*(C'\fR if called with one of those.
+.PP
+The "\fBsearch_invlist()\fR" function is provided for finding a code point within
+an inversion list.
+.SS \fBprop_invmap()\fP
+.IX Subsection "prop_invmap()"
+.Vb 3
+\& use Unicode::UCD \*(Aqprop_invmap\*(Aq;
+\& my ($list_ref, $map_ref, $format, $default)
+\& = prop_invmap("General Category");
+.Ve
+.PP
+\&\f(CW\*(C`prop_invmap\*(C'\fR is used to get the complete mapping definition for a property,
+in the form of an inversion map. An inversion map consists of two parallel
+arrays. One is an ordered list of code points that mark range beginnings, and
+the other gives the value (or mapping) that all code points in the
+corresponding range have.
+.PP
+\&\f(CW\*(C`prop_invmap\*(C'\fR is called with the name of the desired property. The name is
+loosely matched, meaning that differences in case, white-space, hyphens, and
+underscores are not meaningful (except for the trailing underscore in the
+old-form grandfathered-in property \f(CW"L_"\fR, which is better written as \f(CW"LC"\fR,
+or even better, \f(CW"Gc=LC"\fR).
+.PP
+Many Unicode properties have more than one name (or alias). \f(CW\*(C`prop_invmap\*(C'\fR
+understands all of these, including Perl extensions to them. Ambiguities are
+resolved as described above for "\fBprop_aliases()\fR" (except if a property has
+both a complete mapping, and a binary \f(CW\*(C`Y\*(C'\fR/\f(CW\*(C`N\*(C'\fR mapping, then specifying the
+property name prefixed by \f(CW"is"\fR causes the binary one to be returned). The
+Perl internal property "Perl_Decimal_Digit", described below, is also accepted.
+An empty list is returned if the property name is unknown.
+See "Properties accessible through Unicode::UCD" in perluniprops for the
+properties acceptable as inputs to this function.
+.PP
+It is a fatal error to call this function except in list context.
+.PP
+In addition to the two arrays that form the inversion map, \f(CW\*(C`prop_invmap\*(C'\fR
+returns two other values; one is a scalar that gives some details as to the
+format of the entries of the map array; the other is a default value, useful
+in maps whose format name begins with the letter \f(CW"a"\fR, as described
+below in its subsection; and for specialized purposes, such as
+converting to another data structure, described at the end of this main
+section.
+.PP
+This means that \f(CW\*(C`prop_invmap\*(C'\fR returns a 4 element list. For example,
+.PP
+.Vb 2
+\& my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
+\& = prop_invmap("Block");
+.Ve
+.PP
+In this call, the two arrays will be populated as shown below (for Unicode
+6.0):
+.PP
+.Vb 10
+\& Index @blocks_ranges @blocks_maps
+\& 0 0x0000 Basic Latin
+\& 1 0x0080 Latin\-1 Supplement
+\& 2 0x0100 Latin Extended\-A
+\& 3 0x0180 Latin Extended\-B
+\& 4 0x0250 IPA Extensions
+\& 5 0x02B0 Spacing Modifier Letters
+\& 6 0x0300 Combining Diacritical Marks
+\& 7 0x0370 Greek and Coptic
+\& 8 0x0400 Cyrillic
+\& ...
+\& 233 0x2B820 No_Block
+\& 234 0x2F800 CJK Compatibility Ideographs Supplement
+\& 235 0x2FA20 No_Block
+\& 236 0xE0000 Tags
+\& 237 0xE0080 No_Block
+\& 238 0xE0100 Variation Selectors Supplement
+\& 239 0xE01F0 No_Block
+\& 240 0xF0000 Supplementary Private Use Area\-A
+\& 241 0x100000 Supplementary Private Use Area\-B
+\& 242 0x110000 No_Block
+.Ve
+.PP
+The first line (with Index [0]) means that the value for code point 0 is "Basic
+Latin". The entry "0x0080" in the \f(CW@blocks_ranges\fR column in the second line
+means that the value from the first line, "Basic Latin", extends to all code
+points in the range from 0 up to but not including 0x0080, that is, through
+127. In other words, the code points from 0 to 127 are all in the "Basic
+Latin" block. Similarly, all code points in the range from 0x0080 up to (but
+not including) 0x0100 are in the block named "Latin\-1 Supplement", etc.
+(Notice that the return is the old-style block names; see "Old-style versus
+new-style block names").
+.PP
+The final line (with Index [242]) means that all code points above
+the legal Unicode maximum code point have the value "No_Block", which is the
+term Unicode uses for a non-existing block.
+.PP
+The arrays completely specify the mappings for all possible code points.
+The final element in an inversion map returned by this function will always be
+for the range that consists of all the code points that aren't legal Unicode,
+but that are expressible on the platform. (That is, it starts with code point
+0x110000, the first code point above the legal Unicode maximum, and extends to
+infinity.) The value for that range will be the same that any typical
+unassigned code point has for the specified property. (Certain unassigned
+code points are not "typical"; for example the non-character code points, or
+those in blocks that are to be written right-to-left. The above-Unicode
+range's value is not based on these atypical code points.) It could be argued
+that, instead of treating these as unassigned Unicode code points, the value
+for this range should be \f(CW\*(C`undef\*(C'\fR. If you wish, you can change the returned
+arrays accordingly.
+.PP
+The maps for almost all properties are simple scalars that should be
+interpreted as-is.
+These values are those given in the Unicode-supplied data files, which may be
+inconsistent as to capitalization and as to which synonym for a property-value
+is given. The results may be normalized by using the "\fBprop_value_aliases()\fR"
+function.
+.PP
+There are exceptions to the simple scalar maps. Some properties have some
+elements in their map list that are themselves lists of scalars; and some
+special strings are returned that are not to be interpreted as-is. Element
+[2] (placed into \f(CW$format\fR in the example above) of the returned four element
+list tells you if the map has any of these special elements or not, as follows:
+.ie n .IP "\fR\fB""s""\fR\fB\fR" 4
+.el .IP \fR\f(CBs\fR\fB\fR 4
+.IX Item "s"
+means all the elements of the map array are simple scalars, with no special
+elements. Almost all properties are like this, like the \f(CW\*(C`block\*(C'\fR example
+above.
+.ie n .IP "\fR\fB""sl""\fR\fB\fR" 4
+.el .IP \fR\f(CBsl\fR\fB\fR 4
+.IX Item "sl"
+means that some of the map array elements have the form given by \f(CW"s"\fR, and
+the rest are lists of scalars. For example, here is a portion of the output
+of calling \f(CW\*(C`prop_invmap\*(C'\fR() with the "Script Extensions" property:
+.Sp
+.Vb 6
+\& @scripts_ranges @scripts_maps
+\& ...
+\& 0x0953 Devanagari
+\& 0x0964 [ Bengali, Devanagari, Gurmukhi, Oriya ]
+\& 0x0966 Devanagari
+\& 0x0970 Common
+.Ve
+.Sp
+Here, the code points 0x964 and 0x965 are both used in Bengali,
+Devanagari, Gurmukhi, and Oriya, but no other scripts.
+.Sp
+The Name_Alias property is also of this form. But each scalar consists of two
+components: 1) the name, and 2) the type of alias this is. They are
+separated by a colon and a space. In Unicode 6.1, there are several alias types:
+.RS 4
+.ie n .IP """correction""" 4
+.el .IP \f(CWcorrection\fR 4
+.IX Item "correction"
+indicates that the name is a corrected form for the
+original name (which remains valid) for the same code point.
+.ie n .IP """control""" 4
+.el .IP \f(CWcontrol\fR 4
+.IX Item "control"
+adds a new name for a control character.
+.ie n .IP """alternate""" 4
+.el .IP \f(CWalternate\fR 4
+.IX Item "alternate"
+is an alternate name for a character.
+.ie n .IP """figment""" 4
+.el .IP \f(CWfigment\fR 4
+.IX Item "figment"
+is a name for a character that has been documented but was never in any
+actual standard.
+.ie n .IP """abbreviation""" 4
+.el .IP \f(CWabbreviation\fR 4
+.IX Item "abbreviation"
+is a common abbreviation for a character.
+.RE
+.RS 4
+.Sp
+The lists are ordered (roughly) so the most preferred names come before less
+preferred ones.
+.Sp
+For example,
+.Sp
+.Vb 10
+\& @aliases_ranges @alias_maps
+\& ...
+\& 0x009E [ \*(AqPRIVACY MESSAGE: control\*(Aq, \*(AqPM: abbreviation\*(Aq ]
+\& 0x009F [ \*(AqAPPLICATION PROGRAM COMMAND: control\*(Aq,
+\& \*(AqAPC: abbreviation\*(Aq
+\& ]
+\& 0x00A0 \*(AqNBSP: abbreviation\*(Aq
+\& 0x00A1 ""
+\& 0x00AD \*(AqSHY: abbreviation\*(Aq
+\& 0x00AE ""
+\& 0x01A2 \*(AqLATIN CAPITAL LETTER GHA: correction\*(Aq
+\& 0x01A3 \*(AqLATIN SMALL LETTER GHA: correction\*(Aq
+\& 0x01A4 ""
+\& ...
+.Ve
+.Sp
+A map to the empty string means that there is no alias defined for the code
+point.
+.RE
+.ie n .IP "\fR\fB""a""\fR\fB\fR" 4
+.el .IP \fR\f(CBa\fR\fB\fR 4
+.IX Item "a"
+is like \f(CW"s"\fR in that all the map array elements are scalars, but here they are
+restricted to all being integers, and some have to be adjusted (hence the name
+\&\f(CW"a"\fR) to get the correct result. For example, in:
+.Sp
+.Vb 2
+\& my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
+\& = prop_invmap("Simple_Uppercase_Mapping");
+.Ve
+.Sp
+the returned arrays look like this:
+.Sp
+.Vb 7
+\& @$uppers_ranges_ref @$uppers_maps_ref Note
+\& 0 0
+\& 97 65 \*(Aqa\*(Aq maps to \*(AqA\*(Aq, b => B ...
+\& 123 0
+\& 181 924 MICRO SIGN => Greek Cap MU
+\& 182 0
+\& ...
+.Ve
+.Sp
+and \f(CW$default\fR is 0.
+.Sp
+Let's start with the second line. It says that the uppercase of code point 97
+is 65; or \f(CWuc("a")\fR == "A". But the line is for the entire range of code
+points 97 through 122. To get the mapping for any code point in this range,
+you take the offset it has from the beginning code point of the range, and add
+that to the mapping for that first code point. So, the mapping for 122 ("z")
+is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
+yielding 90 ("Z"). Likewise for everything in between.
+.Sp
+Requiring this simple adjustment allows the returned arrays to be
+significantly smaller than otherwise, up to a factor of 10, speeding up
+searching through them.
+.Sp
+Ranges that map to \f(CW$default\fR, \f(CW"0"\fR, behave somewhat differently. For
+these, each code point maps to itself. So, in the first line in the example,
+\&\f(CW\*(C`ord(uc(chr(0)))\*(C'\fR is 0, \f(CW\*(C`ord(uc(chr(1)))\*(C'\fR is 1, ...
+\&\f(CW\*(C`ord(uc(chr(96)))\*(C'\fR is 96.
+.ie n .IP "\fR\fB""al""\fR\fB\fR" 4
+.el .IP \fR\f(CBal\fR\fB\fR 4
+.IX Item "al"
+means that some of the map array elements have the form given by \f(CW"a"\fR, and
+the rest are ordered lists of code points.
+For example, in:
+.Sp
+.Vb 2
+\& my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
+\& = prop_invmap("Uppercase_Mapping");
+.Ve
+.Sp
+the returned arrays look like this:
+.Sp
+.Vb 11
+\& @$uppers_ranges_ref @$uppers_maps_ref
+\& 0 0
+\& 97 65
+\& 123 0
+\& 181 924
+\& 182 0
+\& ...
+\& 0x0149 [ 0x02BC 0x004E ]
+\& 0x014A 0
+\& 0x014B 330
+\& ...
+.Ve
+.Sp
+This is the full Uppercase_Mapping property (as opposed to the
+Simple_Uppercase_Mapping given in the example for format \f(CW"a"\fR). The only
+difference between the two in the ranges shown is that the code point at
+0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
+characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
+CAPITAL LETTER N).
+.Sp
+No adjustments are needed to entries that are references to arrays; each such
+entry will have exactly one element in its range, so the offset is always 0.
+.Sp
+The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this
+format is 0.
+.ie n .IP "\fR\fB""ae""\fR\fB\fR" 4
+.el .IP \fR\f(CBae\fR\fB\fR 4
+.IX Item "ae"
+This is like \f(CW"a"\fR, but some elements are the empty string, and should not be
+adjusted.
+The one internal Perl property accessible by \f(CW\*(C`prop_invmap\*(C'\fR is of this type:
+"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
+that are represented by the Unicode decimal digit characters. Characters that
+don't represent decimal digits map to the empty string, like so:
+.Sp
+.Vb 12
+\& @digits @values
+\& 0x0000 ""
+\& 0x0030 0
+\& 0x003A: ""
+\& 0x0660: 0
+\& 0x066A: ""
+\& 0x06F0: 0
+\& 0x06FA: ""
+\& 0x07C0: 0
+\& 0x07CA: ""
+\& 0x0966: 0
+\& ...
+.Ve
+.Sp
+This means that the code points from 0 to 0x2F do not represent decimal digits;
+the code point 0x30 (DIGIT ZERO) represents 0; code point 0x31, (DIGIT ONE),
+represents 0+1\-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9\-0 = 9;
+\&... code points 0x3A through 0x65F do not represent decimal digits; 0x660
+(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
+represents 0+1\-0 = 1 ...
+.Sp
+The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this
+format is the empty string.
+.ie n .IP "\fR\fB""ale""\fR\fB\fR" 4
+.el .IP \fR\f(CBale\fR\fB\fR 4
+.IX Item "ale"
+is a combination of the \f(CW"al"\fR type and the \f(CW"ae"\fR type. Some of
+the map array elements have the forms given by \f(CW"al"\fR, and
+the rest are the empty string. The property \f(CW\*(C`NFKC_Casefold\*(C'\fR has this form.
+An example slice is:
+.Sp
+.Vb 9
+\& @$ranges_ref @$maps_ref Note
+\& ...
+\& 0x00AA 97 FEMININE ORDINAL INDICATOR => \*(Aqa\*(Aq
+\& 0x00AB 0
+\& 0x00AD SOFT HYPHEN => ""
+\& 0x00AE 0
+\& 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON
+\& 0x00B0 0
+\& ...
+.Ve
+.Sp
+The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this
+format is 0.
+.ie n .IP "\fR\fB""ar""\fR\fB\fR" 4
+.el .IP \fR\f(CBar\fR\fB\fR 4
+.IX Item "ar"
+means that all the elements of the map array are either rational numbers or
+the string \f(CW"NaN"\fR, meaning "Not a Number". A rational number is either an
+integer, or two integers separated by a solidus (\f(CW"/"\fR). The second integer
+represents the denominator of the division implied by the solidus, and is
+actually always positive, so it is guaranteed not to be 0 and to not be
+signed. When the element is a plain integer (without the
+solidus), it may need to be adjusted to get the correct value by adding the
+offset, just as for other \f(CW"a"\fR properties. No adjustment is needed for
+fractions, as the range is guaranteed to have just a single element, and so
+the offset is always 0.
+.Sp
+If you want to convert the returned map to entirely scalar numbers, you
+can use something like this:
+.Sp
+.Vb 4
+\& my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
+\& if ($format && $format eq "ar") {
+\& map { $_ = eval $_ if $_ ne \*(AqNaN\*(Aq } @$invmap_ref;
+\& }
+.Ve
+.Sp
+Here are some entries from the output of the property "Nv", which has format
+\&\f(CW"ar"\fR.
+.Sp
+.Vb 10
+\& @numerics_ranges @numerics_maps Note
+\& 0x00 "NaN"
+\& 0x30 0 DIGIT 0 .. DIGIT 9
+\& 0x3A "NaN"
+\& 0xB2 2 SUPERSCRIPTs 2 and 3
+\& 0xB4 "NaN"
+\& 0xB9 1 SUPERSCRIPT 1
+\& 0xBA "NaN"
+\& 0xBC 1/4 VULGAR FRACTION 1/4
+\& 0xBD 1/2 VULGAR FRACTION 1/2
+\& 0xBE 3/4 VULGAR FRACTION 3/4
+\& 0xBF "NaN"
+\& 0x660 0 ARABIC\-INDIC DIGIT ZERO .. NINE
+\& 0x66A "NaN"
+.Ve
+.Sp
+The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this
+format is \f(CW"NaN"\fR.
+.ie n .IP "\fR\fB""n""\fR\fB\fR" 4
+.el .IP \fR\f(CBn\fR\fB\fR 4
+.IX Item "n"
+means the Name property. All the elements of the map array are simple
+scalars, but some of them contain special strings that require more work to
+get the actual name.
+.Sp
+Entries such as:
+.Sp
+.Vb 1
+\& CJK UNIFIED IDEOGRAPH\-<code point>
+.Ve
+.Sp
+mean that the name for the code point is "CJK UNIFIED IDEOGRAPH\-"
+with the code point (expressed in hexadecimal) appended to it, like "CJK
+UNIFIED IDEOGRAPH\-3403" (similarly for \f(CW\*(C`CJK\ COMPATIBILITY\ IDEOGRAPH\-<code\ point>\*(C'\fR).
+.Sp
+Also, entries like
+.Sp
+.Vb 1
+\& <hangul syllable>
+.Ve
+.Sp
+mean that the name is algorithmically calculated. This is easily done by
+the function "charnames::viacode(code)" in charnames.
+.Sp
+Note that for control characters (\f(CW\*(C`Gc=cc\*(C'\fR), Unicode's data files have the
+string "\f(CW\*(C`<control>\*(C'\fR", but the real name of each of these characters is the empty
+string. This function returns that real name, the empty string. (There are
+names for these characters, but they are considered aliases, not the Name
+property name, and are contained in the \f(CW\*(C`Name_Alias\*(C'\fR property.)
+.ie n .IP "\fR\fB""ad""\fR\fB\fR" 4
+.el .IP \fR\f(CBad\fR\fB\fR 4
+.IX Item "ad"
+means the Decomposition_Mapping property. This property is like \f(CW"al"\fR
+properties, except that one of the scalar elements is of the form:
+.Sp
+.Vb 1
+\& <hangul syllable>
+.Ve
+.Sp
+This signifies that this entry should be replaced by the decompositions for
+all the code points whose decomposition is algorithmically calculated. (All
+of them are currently in one range and no others outside the range are likely
+to ever be added to Unicode; the \f(CW"n"\fR format
+has this same entry.) These can be generated via the function
+\&\fBUnicode::Normalize::NFD()\fR.
+.Sp
+Note that the mapping is the one that is specified in the Unicode data files,
+and to get the final decomposition, it may need to be applied recursively.
+Unicode in fact discourages use of this property except internally in
+implementations of the Unicode Normalization Algorithm.
+.Sp
+The fourth (index [3]) element (\f(CW$default\fR) in the list returned for this
+format is 0.
+.PP
+Note that a format begins with the letter "a" if and only if the property it is
+for requires adjustments by adding the offsets in multi-element ranges. For
+all these properties, an entry should be adjusted only if the map is a scalar
+which is an integer. That is, it must match the regular expression:
+.PP
+.Vb 1
+\& / ^ \-? \ed+ $ /xa
+.Ve
+.PP
+Further, the first element in a range never needs adjustment, as the
+adjustment would be just adding 0.
+.PP
+A binary search such as that provided by "\fBsearch_invlist()\fR" can be used to
+quickly find a code point in the inversion list, and hence its corresponding
+mapping.
+.PP
+The final, fourth element (index [3], assigned to \f(CW$default\fR in the "block"
+example) in the four element list returned by this function is used with the
+\&\f(CW"a"\fR format types; it may also be useful for applications
+that wish to convert the returned inversion map data structure into some
+other, such as a hash. It gives the mapping that most code points map to
+under the property. If you establish the convention that any code point not
+explicitly listed in your data structure maps to this value, you can
+potentially make your data structure much smaller. As you construct your data
+structure from the one returned by this function, simply ignore those ranges
+that map to this value. For example, to
+convert to the data structure searchable by "\fBcharinrange()\fR", you can follow
+this recipe for properties that don't require adjustments:
+.PP
+.Vb 2
+\& my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
+\& my @range_list;
+\&
+\& # Look at each element in the list, but the \-2 is needed because we
+\& # look at $i+1 in the loop, and the final element is guaranteed to map
+\& # to $default by prop_invmap(), so we would skip it anyway.
+\& for my $i (0 .. @$list_ref \- 2) {
+\& next if $map_ref\->[$i] eq $default;
+\& push @range_list, [ $list_ref\->[$i],
+\& $list_ref\->[$i+1],
+\& $map_ref\->[$i]
+\& ];
+\& }
+\&
+\& print charinrange(\e@range_list, $code_point), "\en";
+.Ve
+.PP
+With this, \f(CWcharinrange()\fR will return \f(CW\*(C`undef\*(C'\fR if its input code point maps
+to \f(CW$default\fR. You can avoid this by omitting the \f(CW\*(C`next\*(C'\fR statement, and adding
+a line after the loop to handle the final element of the inversion map.
+.PP
+Similarly, this recipe can be used for properties that do require adjustments:
+.PP
+.Vb 2
+\& for my $i (0 .. @$list_ref \- 2) {
+\& next if $map_ref\->[$i] eq $default;
+\&
+\& # prop_invmap() guarantees that if the mapping is to an array, the
+\& # range has just one element, so no need to worry about adjustments.
+\& if (ref $map_ref\->[$i]) {
+\& push @range_list,
+\& [ $list_ref\->[$i], $list_ref\->[$i], $map_ref\->[$i] ];
+\& }
+\& else { # Otherwise each element is actually mapped to a separate
+\& # value, so the range has to be split into single code point
+\& # ranges.
+\&
+\& my $adjustment = 0;
+\&
+\& # For each code point that gets mapped to something...
+\& for my $j ($list_ref\->[$i] .. $list_ref\->[$i+1] \-1 ) {
+\&
+\& # ... add a range consisting of just it mapping to the
+\& # original plus the adjustment, which is incremented for the
+\& # next time through the loop, as the offset increases by 1
+\& # for each element in the range
+\& push @range_list,
+\& [ $j, $j, $map_ref\->[$i] + $adjustment++ ];
+\& }
+\& }
+\& }
+.Ve
+.PP
+Note that the inversion maps returned for the \f(CW\*(C`Case_Folding\*(C'\fR and
+\&\f(CW\*(C`Simple_Case_Folding\*(C'\fR properties do not include the Turkic-locale mappings.
+Use "\fBcasefold()\fR" for these.
+.PP
+\&\f(CW\*(C`prop_invmap\*(C'\fR does not know about any user-defined properties, and will
+return \f(CW\*(C`undef\*(C'\fR if called with one of those.
+.PP
+The returned values for the Perl extension properties, such as \f(CW\*(C`Any\*(C'\fR and
+\&\f(CW\*(C`Greek\*(C'\fR, are somewhat misleading. The values are either \f(CW"Y"\fR or \f(CW"N"\fR.
+All Unicode properties are bipartite, so you can actually use the \f(CW"Y"\fR or
+\&\f(CW"N"\fR in a Perl regular expression for these, like \f(CW\*(C`qr/\ep{ID_Start=Y}/\*(C'\fR or
+\&\f(CW\*(C`qr/\ep{Upper=N}/\*(C'\fR. But the Perl extensions aren't specified this way, only
+like \f(CW\*(C`qr/\ep{Any}/\*(C'\fR, \fIetc\fR. You can't actually use the \f(CW"Y"\fR and \f(CW"N"\fR in
+them.
+.PP
+\fIGetting every available name\fR
+.IX Subsection "Getting every available name"
+.PP
+Instead of reading the Unicode Database directly from files, as you were able
+to do for a long time, you are encouraged to use the supplied functions. So,
+instead of reading \f(CW\*(C`Name.pl\*(C'\fR directly, which changed formats in 5.32, and may
+do so again without notice in the future or even disappear, you ought to use
+"\fBprop_invmap()\fR" like this:
+.PP
+.Vb 10
+\& my (%name, %cp, %cps, $n);
+\& # All codepoints
+\& foreach my $cat (qw( Name Name_Alias )) {
+\& my ($codepoints, $names, $format, $default) = prop_invmap($cat);
+\& # $format => "n", $default => ""
+\& foreach my $i (0 .. @$codepoints \- 2) {
+\& my ($cp, $n) = ($codepoints\->[$i], $names\->[$i]);
+\& # If $n is a ref, the same codepoint has multiple names
+\& foreach my $name (ref $n ? @$n : $n) {
+\& $name{$cp} //= $name;
+\& $cp{$name} //= $cp;
+\& }
+\& }
+\& }
+\& # Named sequences
+\& { my %ns = namedseq();
+\& foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
+\& $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
+\& }
+\& }
+.Ve
+.SS \fBsearch_invlist()\fP
+.IX Subsection "search_invlist()"
+.Vb 2
+\& use Unicode::UCD qw(prop_invmap prop_invlist);
+\& use Unicode::UCD \*(Aqsearch_invlist\*(Aq;
+\&
+\& my @invlist = prop_invlist($property_name);
+\& print $code_point, ((search_invlist(\e@invlist, $code_point) // \-1) % 2)
+\& ? " isn\*(Aqt"
+\& : " is",
+\& " in $property_name\en";
+\&
+\& my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
+\& my $index = search_invlist($blocks_ranges_ref, $code_point);
+\& print "$code_point is in block ", $blocks_map_ref\->[$index], "\en";
+.Ve
+.PP
+\&\f(CW\*(C`search_invlist\*(C'\fR is used to search an inversion list returned by
+\&\f(CW\*(C`prop_invlist\*(C'\fR or \f(CW\*(C`prop_invmap\*(C'\fR for a particular "code point argument".
+\&\f(CW\*(C`undef\*(C'\fR is returned if the code point is not found in the inversion list
+(this happens only when it is not a legal "code point argument", or is less
+than the list's first element). A warning is raised in the first instance.
+.PP
+Otherwise, it returns the index into the list of the range that contains the
+code point; that is, it finds \f(CW\*(C`i\*(C'\fR such that
+.PP
+.Vb 1
+\& list[i]<= code_point < list[i+1].
+.Ve
+.PP
+As explained in "\fBprop_invlist()\fR", whether a code point is in the list or not
+depends on if the index is even (in) or odd (not in). And as explained in
+"\fBprop_invmap()\fR", the index is used with the returned parallel array to find
+the mapping.
+.SS Unicode::UCD::UnicodeVersion
+.IX Subsection "Unicode::UCD::UnicodeVersion"
+This returns the version of the Unicode Character Database, in other words, the
+version of the Unicode standard the database implements. The version is a
+string of numbers delimited by dots (\f(CW\*(Aq.\*(Aq\fR).
+.SS "\fBBlocks versus Scripts\fP"
+.IX Subsection "Blocks versus Scripts"
+The difference between a block and a script is that scripts are closer
+to the linguistic notion of a set of code points required to represent
+languages, while block is more of an artifact of the Unicode code point
+numbering and separation into blocks of consecutive code points (so far the
+size of a block is some multiple of 16, like 128 or 256).
+.PP
+For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
+as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and
+\&\f(CW\*(C`Latin Extended\-B\*(C'\fR. On the other hand, the Latin script does not
+contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as
+ASCII): it includes only the letters, and not, for example, the digits
+nor the punctuation.
+.PP
+For blocks see <http://www.unicode.org/Public/UNIDATA/Blocks.txt>
+.PP
+For scripts see UTR #24: <http://www.unicode.org/reports/tr24/>
+.SS "\fBMatching Scripts and Blocks\fP"
+.IX Subsection "Matching Scripts and Blocks"
+Scripts are matched with the regular-expression construct
+\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script),
+while \f(CW\*(C`\ep{Blk=...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{Blk=Tibetan}\*(C'\fR matches
+any of the 256 code points in the Tibetan block).
+.SS "Old-style versus new-style block names"
+.IX Subsection "Old-style versus new-style block names"
+Unicode publishes the names of blocks in two different styles, though the two
+are equivalent under Unicode's loose matching rules.
+.PP
+The original style uses blanks and hyphens in the block names (except for
+\&\f(CW\*(C`No_Block\*(C'\fR), like so:
+.PP
+.Vb 1
+\& Miscellaneous Mathematical Symbols\-B
+.Ve
+.PP
+The newer style replaces these with underscores, like this:
+.PP
+.Vb 1
+\& Miscellaneous_Mathematical_Symbols_B
+.Ve
+.PP
+This newer style is consistent with the values of other Unicode properties.
+To preserve backward compatibility, all the functions in Unicode::UCD that
+return block names (except as noted) return the old-style ones.
+"\fBprop_value_aliases()\fR" returns the new-style and can be used to convert from
+old-style to new-style:
+.PP
+.Vb 1
+\& my $new_style = prop_value_aliases("block", $old_style);
+.Ve
+.PP
+Perl also has single-form extensions that refer to blocks, \f(CW\*(C`In_Cyrillic\*(C'\fR,
+meaning \f(CW\*(C`Block=Cyrillic\*(C'\fR. These have always been written in the new style.
+.PP
+To convert from new-style to old-style, follow this recipe:
+.PP
+.Vb 1
+\& $old_style = charblock((prop_invlist("block=$new_style"))[0]);
+.Ve
+.PP
+(which finds the range of code points in the block using \f(CW\*(C`prop_invlist\*(C'\fR,
+gets the lower end of the range (0th element) and then looks up the old name
+for its block using \f(CW\*(C`charblock\*(C'\fR).
+.PP
+Note that starting in Unicode 6.1, many of the block names have shorter
+synonyms. These are always given in the new style.
+.SS "Use with older Unicode versions"
+.IX Subsection "Use with older Unicode versions"
+The functions in this module work as well as can be expected when
+used on earlier Unicode versions. But, obviously, they use the available data
+from that Unicode version. For example, if the Unicode version predates the
+definition of the script property (Unicode 3.1), then any function that deals
+with scripts is going to return \f(CW\*(C`undef\*(C'\fR for the script portion of the return
+value.
+.SH AUTHOR
+.IX Header "AUTHOR"
+Jarkko Hietaniemi. Now maintained by perl5 porters.