summaryrefslogtreecommitdiffstats
path: root/libraries/liblunicode/ure/README
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--libraries/liblunicode/ure/README212
1 files changed, 212 insertions, 0 deletions
diff --git a/libraries/liblunicode/ure/README b/libraries/liblunicode/ure/README
new file mode 100644
index 0000000..c9918f5
--- /dev/null
+++ b/libraries/liblunicode/ure/README
@@ -0,0 +1,212 @@
+#
+# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
+#
+# Copyright 1997, 1998, 1999 Computing Research Labs,
+# New Mexico State University
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+
+
+ Unicode and Regular Expressions
+ Version 0.5
+
+This is a simple regular expression package for matching against Unicode text
+in UCS2 form. The implementation of this URE package is a variation on the
+RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
+Hopkins' algorithm had the virtue of being very simple, so it was used as a
+model.
+
+---------------------------------------------------------------------------
+
+Assumptions:
+
+ o Regular expression and text already normalized.
+
+ o Conversion to lower case assumes a 1-1 mapping.
+
+Definitions:
+
+ Separator - any one of U+2028, U+2029, '\n', '\r'.
+
+Operators:
+ . - match any character.
+ * - match zero or more of the last subexpression.
+ + - match one or more of the last subexpression.
+ ? - match zero or one of the last subexpression.
+ () - subexpression grouping.
+
+ Notes:
+
+ o The "." operator normally does not match separators, but a flag is
+ available for the ure_exec() function that will allow this operator to
+ match a separator.
+
+Literals and Constants:
+
+ c - literal UCS2 character.
+ \x.... - hexadecimal number of up to 4 digits.
+ \X.... - hexadecimal number of up to 4 digits.
+ \u.... - hexadecimal number of up to 4 digits.
+ \U.... - hexadecimal number of up to 4 digits.
+
+Character classes:
+
+ [...] - Character class.
+ [^...] - Negated character class.
+ \pN1,N2,...,Nn - Character properties class.
+ \PN1,N2,...,Nn - Negated character properties class.
+
+ POSIX character classes recognized:
+
+ :alnum:
+ :alpha:
+ :cntrl:
+ :digit:
+ :graph:
+ :lower:
+ :print:
+ :punct:
+ :space:
+ :upper:
+ :xdigit:
+
+ Notes:
+
+ o Character property classes are \p or \P followed by a comma-separated
+ list of integers between 1 and 32. These integers are references to
+ the following character properties:
+
+ N Character Property
+ --------------------------
+ 1 _URE_NONSPACING
+ 2 _URE_COMBINING
+ 3 _URE_NUMDIGIT
+ 4 _URE_NUMOTHER
+ 5 _URE_SPACESEP
+ 6 _URE_LINESEP
+ 7 _URE_PARASEP
+ 8 _URE_CNTRL
+ 9 _URE_PUA
+ 10 _URE_UPPER
+ 11 _URE_LOWER
+ 12 _URE_TITLE
+ 13 _URE_MODIFIER
+ 14 _URE_OTHERLETTER
+ 15 _URE_DASHPUNCT
+ 16 _URE_OPENPUNCT
+ 17 _URE_CLOSEPUNCT
+ 18 _URE_OTHERPUNCT
+ 19 _URE_MATHSYM
+ 20 _URE_CURRENCYSYM
+ 21 _URE_OTHERSYM
+ 22 _URE_LTR
+ 23 _URE_RTL
+ 24 _URE_EURONUM
+ 25 _URE_EURONUMSEP
+ 26 _URE_EURONUMTERM
+ 27 _URE_ARABNUM
+ 28 _URE_COMMONSEP
+ 29 _URE_BLOCKSEP
+ 30 _URE_SEGMENTSEP
+ 31 _URE_WHITESPACE
+ 32 _URE_OTHERNEUT
+
+ o Character classes can contain literals, constants, and character
+ property classes. Example:
+
+ [abc\U10A\p1,3,4]
+
+---------------------------------------------------------------------------
+
+Before using URE
+----------------
+Before URE is used, two functions need to be created: one to check if a
+character matches a set of URE character properties, and one to convert a
+character to lower case.
+
+Stubs for these functions are located in the urestubs.c file.
+
+Using URE
+---------
+
+Sample pseudo-code fragment.
+
+ ure_buffer_t rebuf;
+ ure_dfa_t dfa;
+ ucs2_t *re, *text;
+ unsigned long relen, textlen;
+ unsigned long match_start, match_end;
+
+ /*
+ * Allocate the dynamic storage needed to compile regular expressions.
+ */
+ rebuf = ure_buffer_create();
+
+ for each regular expression in a list {
+ re = next regular expression;
+ relen = length(re);
+
+ /*
+ * Compile the regular expression with the case insensitive flag
+ * turned on.
+ */
+ dfa = ure_compile(re, relen, 1, rebuf);
+
+ /*
+ * Look for the first match in some text. The matching will be done
+ * in a case insensitive manner because the expression was compiled
+ * with the case insensitive flag on.
+ */
+ if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
+ printf("MATCH: %lu %lu\n", match_start, match_end);
+
+ /*
+ * Look for the first match in some text, ignoring non-spacing
+ * characters.
+ */
+ if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
+ &match_start, &match_end))
+ printf("MATCH: %lu %lu\n", match_start, match_end);
+
+ /*
+ * Free the DFA.
+ */
+ ure_free_dfa(dfa);
+ }
+
+ /*
+ * Free the dynamic storage used for compiling the expressions.
+ */
+ ure_free_buffer(rebuf);
+
+---------------------------------------------------------------------------
+
+Mark Leisher <mleisher@crl.nmsu.edu>
+29 March 1997
+
+===========================================================================
+
+CHANGES
+-------
+
+Version: 0.5
+Date : 21 September 1999
+==========================
+ 1. Added copyright stuff and put in CVS.