summaryrefslogtreecommitdiffstats
path: root/upstream/fedora-40/man1/perlreapi.1
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:43:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:43:11 +0000
commitfc22b3d6507c6745911b9dfcc68f1e665ae13dbc (patch)
treece1e3bce06471410239a6f41282e328770aa404a /upstream/fedora-40/man1/perlreapi.1
parentInitial commit. (diff)
downloadmanpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.tar.xz
manpages-l10n-fc22b3d6507c6745911b9dfcc68f1e665ae13dbc.zip
Adding upstream version 4.22.0.upstream/4.22.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'upstream/fedora-40/man1/perlreapi.1')
-rw-r--r--upstream/fedora-40/man1/perlreapi.1925
1 files changed, 925 insertions, 0 deletions
diff --git a/upstream/fedora-40/man1/perlreapi.1 b/upstream/fedora-40/man1/perlreapi.1
new file mode 100644
index 00000000..05dee3ac
--- /dev/null
+++ b/upstream/fedora-40/man1/perlreapi.1
@@ -0,0 +1,925 @@
+.\" -*- mode: troff; coding: utf-8 -*-
+.\" Automatically generated by Pod::Man 5.01 (Pod::Simple 3.43)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" \*(C` and \*(C' are quotes in nroff, nothing in troff, for use with C<>.
+.ie n \{\
+. ds C` ""
+. ds C' ""
+'br\}
+.el\{\
+. ds C`
+. ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el .ds Aq '
+.\"
+.\" If the F register is >0, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD. Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{\
+. if \nF \{\
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
+..
+. if !\nF==2 \{\
+. nr % 0
+. nr F 2
+. \}
+. \}
+.\}
+.rr rF
+.\" ========================================================================
+.\"
+.IX Title "PERLREAPI 1"
+.TH PERLREAPI 1 2024-01-25 "perl v5.38.2" "Perl Programmers Reference Guide"
+.\" For nroff, turn off justification. Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH NAME
+perlreapi \- Perl regular expression plugin interface
+.SH DESCRIPTION
+.IX Header "DESCRIPTION"
+As of Perl 5.9.5 there is a new interface for plugging and using
+regular expression engines other than the default one.
+.PP
+Each engine is supposed to provide access to a constant structure of the
+following format:
+.PP
+.Vb 10
+\& typedef struct regexp_engine {
+\& REGEXP* (*comp) (pTHX_
+\& const SV * const pattern, const U32 flags);
+\& I32 (*exec) (pTHX_
+\& REGEXP * const rx,
+\& char* stringarg,
+\& char* strend, char* strbeg,
+\& SSize_t minend, SV* sv,
+\& void* data, U32 flags);
+\& char* (*intuit) (pTHX_
+\& REGEXP * const rx, SV *sv,
+\& const char * const strbeg,
+\& char *strpos, char *strend, U32 flags,
+\& struct re_scream_pos_data_s *data);
+\& SV* (*checkstr) (pTHX_ REGEXP * const rx);
+\& void (*free) (pTHX_ REGEXP * const rx);
+\& void (*numbered_buff_FETCH) (pTHX_
+\& REGEXP * const rx,
+\& const I32 paren,
+\& SV * const sv);
+\& void (*numbered_buff_STORE) (pTHX_
+\& REGEXP * const rx,
+\& const I32 paren,
+\& SV const * const value);
+\& I32 (*numbered_buff_LENGTH) (pTHX_
+\& REGEXP * const rx,
+\& const SV * const sv,
+\& const I32 paren);
+\& SV* (*named_buff) (pTHX_
+\& REGEXP * const rx,
+\& SV * const key,
+\& SV * const value,
+\& U32 flags);
+\& SV* (*named_buff_iter) (pTHX_
+\& REGEXP * const rx,
+\& const SV * const lastkey,
+\& const U32 flags);
+\& SV* (*qr_package)(pTHX_ REGEXP * const rx);
+\& #ifdef USE_ITHREADS
+\& void* (*dupe) (pTHX_ REGEXP * const rx, CLONE_PARAMS *param);
+\& #endif
+\& REGEXP* (*op_comp) (...);
+.Ve
+.PP
+When a regexp is compiled, its \f(CW\*(C`engine\*(C'\fR field is then set to point at
+the appropriate structure, so that when it needs to be used Perl can find
+the right routines to do so.
+.PP
+In order to install a new regexp handler, \f(CW$^H{regcomp}\fR is set
+to an integer which (when cast appropriately) resolves to one of these
+structures. When compiling, the \f(CW\*(C`comp\*(C'\fR method is executed, and the
+resulting \f(CW\*(C`regexp\*(C'\fR structure's engine field is expected to point back at
+the same structure.
+.PP
+The pTHX_ symbol in the definition is a macro used by Perl under threading
+to provide an extra argument to the routine holding a pointer back to
+the interpreter that is executing the regexp. So under threading all
+routines get an extra argument.
+.SH Callbacks
+.IX Header "Callbacks"
+.SS comp
+.IX Subsection "comp"
+.Vb 1
+\& REGEXP* comp(pTHX_ const SV * const pattern, const U32 flags);
+.Ve
+.PP
+Compile the pattern stored in \f(CW\*(C`pattern\*(C'\fR using the given \f(CW\*(C`flags\*(C'\fR and
+return a pointer to a prepared \f(CW\*(C`REGEXP\*(C'\fR structure that can perform
+the match. See "The REGEXP structure" below for an explanation of
+the individual fields in the REGEXP struct.
+.PP
+The \f(CW\*(C`pattern\*(C'\fR parameter is the scalar that was used as the
+pattern. Previous versions of Perl would pass two \f(CW\*(C`char*\*(C'\fR indicating
+the start and end of the stringified pattern; the following snippet can
+be used to get the old parameters:
+.PP
+.Vb 3
+\& STRLEN plen;
+\& char* exp = SvPV(pattern, plen);
+\& char* xend = exp + plen;
+.Ve
+.PP
+Since any scalar can be passed as a pattern, it's possible to implement
+an engine that does something with an array (\f(CW\*(C`"ook" =~ [ qw/ eek
+hlagh / ]\*(C'\fR) or with the non-stringified form of a compiled regular
+expression (\f(CW\*(C`"ook" =~ qr/eek/\*(C'\fR). Perl's own engine will always
+stringify everything using the snippet above, but that doesn't mean
+other engines have to.
+.PP
+The \f(CW\*(C`flags\*(C'\fR parameter is a bitfield which indicates which of the
+\&\f(CW\*(C`msixpn\*(C'\fR flags the regex was compiled with. It also contains
+additional info, such as if \f(CW\*(C`use locale\*(C'\fR is in effect.
+.PP
+The \f(CW\*(C`eogc\*(C'\fR flags are stripped out before being passed to the comp
+routine. The regex engine does not need to know if any of these
+are set, as those flags should only affect what Perl does with the
+pattern and its match variables, not how it gets compiled and
+executed.
+.PP
+By the time the comp callback is called, some of these flags have
+already had effect (noted below where applicable). However most of
+their effect occurs after the comp callback has run, in routines that
+read the \f(CW\*(C`rx\->extflags\*(C'\fR field which it populates.
+.PP
+In general the flags should be preserved in \f(CW\*(C`rx\->extflags\*(C'\fR after
+compilation, although the regex engine might want to add or delete
+some of them to invoke or disable some special behavior in Perl. The
+flags along with any special behavior they cause are documented below:
+.PP
+The pattern modifiers:
+.ie n .IP """/m"" \- RXf_PMf_MULTILINE" 4
+.el .IP "\f(CW/m\fR \- RXf_PMf_MULTILINE" 4
+.IX Item "/m - RXf_PMf_MULTILINE"
+If this is in \f(CW\*(C`rx\->extflags\*(C'\fR it will be passed to
+\&\f(CW\*(C`Perl_fbm_instr\*(C'\fR by \f(CW\*(C`pp_split\*(C'\fR which will treat the subject string
+as a multi-line string.
+.ie n .IP """/s"" \- RXf_PMf_SINGLELINE" 4
+.el .IP "\f(CW/s\fR \- RXf_PMf_SINGLELINE" 4
+.IX Item "/s - RXf_PMf_SINGLELINE"
+.PD 0
+.ie n .IP """/i"" \- RXf_PMf_FOLD" 4
+.el .IP "\f(CW/i\fR \- RXf_PMf_FOLD" 4
+.IX Item "/i - RXf_PMf_FOLD"
+.ie n .IP """/x"" \- RXf_PMf_EXTENDED" 4
+.el .IP "\f(CW/x\fR \- RXf_PMf_EXTENDED" 4
+.IX Item "/x - RXf_PMf_EXTENDED"
+.PD
+If present on a regex, \f(CW"#"\fR comments will be handled differently by the
+tokenizer in some cases.
+.Sp
+TODO: Document those cases.
+.ie n .IP """/p"" \- RXf_PMf_KEEPCOPY" 4
+.el .IP "\f(CW/p\fR \- RXf_PMf_KEEPCOPY" 4
+.IX Item "/p - RXf_PMf_KEEPCOPY"
+TODO: Document this
+.IP "Character set" 4
+.IX Item "Character set"
+The character set rules are determined by an enum that is contained
+in this field. This is still experimental and subject to change, but
+the current interface returns the rules by use of the in-line function
+\&\f(CW\*(C`get_regex_charset(const U32 flags)\*(C'\fR. The only currently documented
+value returned from it is REGEX_LOCALE_CHARSET, which is set if
+\&\f(CW\*(C`use locale\*(C'\fR is in effect. If present in \f(CW\*(C`rx\->extflags\*(C'\fR,
+\&\f(CW\*(C`split\*(C'\fR will use the locale dependent definition of whitespace
+when RXf_SKIPWHITE or RXf_WHITE is in effect. ASCII whitespace
+is defined as per isSPACE, and by the internal
+macros \f(CW\*(C`is_utf8_space\*(C'\fR under UTF\-8, and \f(CW\*(C`isSPACE_LC\*(C'\fR under \f(CW\*(C`use
+locale\*(C'\fR.
+.PP
+Additional flags:
+.IP RXf_SPLIT 4
+.IX Item "RXf_SPLIT"
+This flag was removed in perl 5.18.0. \f(CW\*(C`split \*(Aq \*(Aq\*(C'\fR is now special-cased
+solely in the parser. RXf_SPLIT is still #defined, so you can test for it.
+This is how it used to work:
+.Sp
+If \f(CW\*(C`split\*(C'\fR is invoked as \f(CW\*(C`split \*(Aq \*(Aq\*(C'\fR or with no arguments (which
+really means \f(CW\*(C`split(\*(Aq \*(Aq, $_)\*(C'\fR, see split), Perl will
+set this flag. The regex engine can then check for it and set the
+SKIPWHITE and WHITE extflags. To do this, the Perl engine does:
+.Sp
+.Vb 2
+\& if (flags & RXf_SPLIT && r\->prelen == 1 && r\->precomp[0] == \*(Aq \*(Aq)
+\& r\->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
+.Ve
+.PP
+These flags can be set during compilation to enable optimizations in
+the \f(CW\*(C`split\*(C'\fR operator.
+.IP RXf_SKIPWHITE 4
+.IX Item "RXf_SKIPWHITE"
+This flag was removed in perl 5.18.0. It is still #defined, so you can
+set it, but doing so will have no effect. This is how it used to work:
+.Sp
+If the flag is present in \f(CW\*(C`rx\->extflags\*(C'\fR \f(CW\*(C`split\*(C'\fR will delete
+whitespace from the start of the subject string before it's operated
+on. What is considered whitespace depends on if the subject is a
+UTF\-8 string and if the \f(CW\*(C`RXf_PMf_LOCALE\*(C'\fR flag is set.
+.Sp
+If RXf_WHITE is set in addition to this flag, \f(CW\*(C`split\*(C'\fR will behave like
+\&\f(CW\*(C`split " "\*(C'\fR under the Perl engine.
+.IP RXf_START_ONLY 4
+.IX Item "RXf_START_ONLY"
+Tells the split operator to split the target string on newlines
+(\f(CW\*(C`\en\*(C'\fR) without invoking the regex engine.
+.Sp
+Perl's engine sets this if the pattern is \f(CW\*(C`/^/\*(C'\fR (\f(CW\*(C`plen == 1 && *exp
+== \*(Aq^\*(Aq\*(C'\fR), even under \f(CW\*(C`/^/s\*(C'\fR; see split. Of course a
+different regex engine might want to use the same optimizations
+with a different syntax.
+.IP RXf_WHITE 4
+.IX Item "RXf_WHITE"
+Tells the split operator to split the target string on whitespace
+without invoking the regex engine. The definition of whitespace varies
+depending on if the target string is a UTF\-8 string and on
+if RXf_PMf_LOCALE is set.
+.Sp
+Perl's engine sets this flag if the pattern is \f(CW\*(C`\es+\*(C'\fR.
+.IP RXf_NULL 4
+.IX Item "RXf_NULL"
+Tells the split operator to split the target string on
+characters. The definition of character varies depending on if
+the target string is a UTF\-8 string.
+.Sp
+Perl's engine sets this flag on empty patterns; this optimization
+makes \f(CW\*(C`split //\*(C'\fR much faster than it would otherwise be. It's even
+faster than \f(CW\*(C`unpack\*(C'\fR.
+.IP RXf_NO_INPLACE_SUBST 4
+.IX Item "RXf_NO_INPLACE_SUBST"
+Added in perl 5.18.0, this flag indicates that a regular expression might
+perform an operation that would interfere with inplace substitution. For
+instance it might contain lookbehind, or assign to non-magical variables
+(such as \f(CW$REGMARK\fR and \f(CW$REGERROR\fR) during matching. \f(CW\*(C`s///\*(C'\fR will skip
+certain optimisations when this is set.
+.SS exec
+.IX Subsection "exec"
+.Vb 4
+\& I32 exec(pTHX_ REGEXP * const rx,
+\& char *stringarg, char* strend, char* strbeg,
+\& SSize_t minend, SV* sv,
+\& void* data, U32 flags);
+.Ve
+.PP
+Execute a regexp. The arguments are
+.IP rx 4
+.IX Item "rx"
+The regular expression to execute.
+.IP sv 4
+.IX Item "sv"
+This is the SV to be matched against. Note that the
+actual char array to be matched against is supplied by the arguments
+described below; the SV is just used to determine UTF8ness, \f(CWpos()\fR etc.
+.IP strbeg 4
+.IX Item "strbeg"
+Pointer to the physical start of the string.
+.IP strend 4
+.IX Item "strend"
+Pointer to the character following the physical end of the string (i.e.
+the \f(CW\*(C`\e0\*(C'\fR, if any).
+.IP stringarg 4
+.IX Item "stringarg"
+Pointer to the position in the string where matching should start; it might
+not be equal to \f(CW\*(C`strbeg\*(C'\fR (for example in a later iteration of \f(CW\*(C`/.../g\*(C'\fR).
+.IP minend 4
+.IX Item "minend"
+Minimum length of string (measured in bytes from \f(CW\*(C`stringarg\*(C'\fR) that must
+match; if the engine reaches the end of the match but hasn't reached this
+position in the string, it should fail.
+.IP data 4
+.IX Item "data"
+Optimisation data; subject to change.
+.IP flags 4
+.IX Item "flags"
+Optimisation flags; subject to change.
+.SS intuit
+.IX Subsection "intuit"
+.Vb 8
+\& char* intuit(pTHX_
+\& REGEXP * const rx,
+\& SV *sv,
+\& const char * const strbeg,
+\& char *strpos,
+\& char *strend,
+\& const U32 flags,
+\& struct re_scream_pos_data_s *data);
+.Ve
+.PP
+Find the start position where a regex match should be attempted,
+or possibly if the regex engine should not be run because the
+pattern can't match. This is called, as appropriate, by the core,
+depending on the values of the \f(CW\*(C`extflags\*(C'\fR member of the \f(CW\*(C`regexp\*(C'\fR
+structure.
+.PP
+Arguments:
+.PP
+.Vb 11
+\& rx: the regex to match against
+\& sv: the SV being matched: only used for utf8 flag; the string
+\& itself is accessed via the pointers below. Note that on
+\& something like an overloaded SV, SvPOK(sv) may be false
+\& and the string pointers may point to something unrelated to
+\& the SV itself.
+\& strbeg: real beginning of string
+\& strpos: the point in the string at which to begin matching
+\& strend: pointer to the byte following the last char of the string
+\& flags: currently unused; set to 0
+\& data: currently unused; set to NULL
+.Ve
+.SS checkstr
+.IX Subsection "checkstr"
+.Vb 1
+\& SV* checkstr(pTHX_ REGEXP * const rx);
+.Ve
+.PP
+Return a SV containing a string that must appear in the pattern. Used
+by \f(CW\*(C`split\*(C'\fR for optimising matches.
+.SS free
+.IX Subsection "free"
+.Vb 1
+\& void free(pTHX_ REGEXP * const rx);
+.Ve
+.PP
+Called by Perl when it is freeing a regexp pattern so that the engine
+can release any resources pointed to by the \f(CW\*(C`pprivate\*(C'\fR member of the
+\&\f(CW\*(C`regexp\*(C'\fR structure. This is only responsible for freeing private data;
+Perl will handle releasing anything else contained in the \f(CW\*(C`regexp\*(C'\fR structure.
+.SS "Numbered capture callbacks"
+.IX Subsection "Numbered capture callbacks"
+Called to get/set the value of \f(CW\*(C`$\`\*(C'\fR, \f(CW\*(C`$\*(Aq\*(C'\fR, \f(CW$&\fR and their named
+equivalents, ${^PREMATCH}, ${^POSTMATCH} and ${^MATCH}, as well as the
+numbered capture groups (\f(CW$1\fR, \f(CW$2\fR, ...).
+.PP
+The \f(CW\*(C`paren\*(C'\fR parameter will be \f(CW1\fR for \f(CW$1\fR, \f(CW2\fR for \f(CW$2\fR and so
+forth, and have these symbolic values for the special variables:
+.PP
+.Vb 6
+\& ${^PREMATCH} RX_BUFF_IDX_CARET_PREMATCH
+\& ${^POSTMATCH} RX_BUFF_IDX_CARET_POSTMATCH
+\& ${^MATCH} RX_BUFF_IDX_CARET_FULLMATCH
+\& $\` RX_BUFF_IDX_PREMATCH
+\& $\*(Aq RX_BUFF_IDX_POSTMATCH
+\& $& RX_BUFF_IDX_FULLMATCH
+.Ve
+.PP
+Note that in Perl 5.17.3 and earlier, the last three constants were also
+used for the caret variants of the variables.
+.PP
+The names have been chosen by analogy with Tie::Scalar method
+names with an additional \fBLENGTH\fR callback for efficiency. However
+named capture variables are currently not tied internally but
+implemented via magic.
+.PP
+\fInumbered_buff_FETCH\fR
+.IX Subsection "numbered_buff_FETCH"
+.PP
+.Vb 2
+\& void numbered_buff_FETCH(pTHX_ REGEXP * const rx, const I32 paren,
+\& SV * const sv);
+.Ve
+.PP
+Fetch a specified numbered capture. \f(CW\*(C`sv\*(C'\fR should be set to the scalar
+to return, the scalar is passed as an argument rather than being
+returned from the function because when it's called Perl already has a
+scalar to store the value, creating another one would be
+redundant. The scalar can be set with \f(CW\*(C`sv_setsv\*(C'\fR, \f(CW\*(C`sv_setpvn\*(C'\fR and
+friends, see perlapi.
+.PP
+This callback is where Perl untaints its own capture variables under
+taint mode (see perlsec). See the \f(CW\*(C`Perl_reg_numbered_buff_fetch\*(C'\fR
+function in \fIregcomp.c\fR for how to untaint capture variables if
+that's something you'd like your engine to do as well.
+.PP
+\fInumbered_buff_STORE\fR
+.IX Subsection "numbered_buff_STORE"
+.PP
+.Vb 4
+\& void (*numbered_buff_STORE) (pTHX_
+\& REGEXP * const rx,
+\& const I32 paren,
+\& SV const * const value);
+.Ve
+.PP
+Set the value of a numbered capture variable. \f(CW\*(C`value\*(C'\fR is the scalar
+that is to be used as the new value. It's up to the engine to make
+sure this is used as the new value (or reject it).
+.PP
+Example:
+.PP
+.Vb 4
+\& if ("ook" =~ /(o*)/) {
+\& # \*(Aqparen\*(Aq will be \*(Aq1\*(Aq and \*(Aqvalue\*(Aq will be \*(Aqee\*(Aq
+\& $1 =~ tr/o/e/;
+\& }
+.Ve
+.PP
+Perl's own engine will croak on any attempt to modify the capture
+variables, to do this in another engine use the following callback
+(copied from \f(CW\*(C`Perl_reg_numbered_buff_store\*(C'\fR):
+.PP
+.Vb 9
+\& void
+\& Example_reg_numbered_buff_store(pTHX_
+\& REGEXP * const rx,
+\& const I32 paren,
+\& SV const * const value)
+\& {
+\& PERL_UNUSED_ARG(rx);
+\& PERL_UNUSED_ARG(paren);
+\& PERL_UNUSED_ARG(value);
+\&
+\& if (!PL_localizing)
+\& Perl_croak(aTHX_ PL_no_modify);
+\& }
+.Ve
+.PP
+Actually Perl will not \fIalways\fR croak in a statement that looks
+like it would modify a numbered capture variable. This is because the
+STORE callback will not be called if Perl can determine that it
+doesn't have to modify the value. This is exactly how tied variables
+behave in the same situation:
+.PP
+.Vb 2
+\& package CaptureVar;
+\& use parent \*(AqTie::Scalar\*(Aq;
+\&
+\& sub TIESCALAR { bless [] }
+\& sub FETCH { undef }
+\& sub STORE { die "This doesn\*(Aqt get called" }
+\&
+\& package main;
+\&
+\& tie my $sv => "CaptureVar";
+\& $sv =~ y/a/b/;
+.Ve
+.PP
+Because \f(CW$sv\fR is \f(CW\*(C`undef\*(C'\fR when the \f(CW\*(C`y///\*(C'\fR operator is applied to it,
+the transliteration won't actually execute and the program won't
+\&\f(CW\*(C`die\*(C'\fR. This is different to how 5.8 and earlier versions behaved
+since the capture variables were READONLY variables then; now they'll
+just die when assigned to in the default engine.
+.PP
+\fInumbered_buff_LENGTH\fR
+.IX Subsection "numbered_buff_LENGTH"
+.PP
+.Vb 4
+\& I32 numbered_buff_LENGTH (pTHX_
+\& REGEXP * const rx,
+\& const SV * const sv,
+\& const I32 paren);
+.Ve
+.PP
+Get the \f(CW\*(C`length\*(C'\fR of a capture variable. There's a special callback
+for this so that Perl doesn't have to do a FETCH and run \f(CW\*(C`length\*(C'\fR on
+the result, since the length is (in Perl's case) known from an offset
+stored in \f(CW\*(C`rx\->offs\*(C'\fR, this is much more efficient:
+.PP
+.Vb 3
+\& I32 s1 = rx\->offs[paren].start;
+\& I32 s2 = rx\->offs[paren].end;
+\& I32 len = s2 \- s1;
+.Ve
+.PP
+This is a little bit more complex in the case of UTF\-8, see what
+\&\f(CW\*(C`Perl_reg_numbered_buff_length\*(C'\fR does with
+is_utf8_string_loclen.
+.SS "Named capture callbacks"
+.IX Subsection "Named capture callbacks"
+Called to get/set the value of \f(CW\*(C`%+\*(C'\fR and \f(CW\*(C`%\-\*(C'\fR, as well as by some
+utility functions in re.
+.PP
+There are two callbacks, \f(CW\*(C`named_buff\*(C'\fR is called in all the cases the
+FETCH, STORE, DELETE, CLEAR, EXISTS and SCALAR Tie::Hash callbacks
+would be on changes to \f(CW\*(C`%+\*(C'\fR and \f(CW\*(C`%\-\*(C'\fR and \f(CW\*(C`named_buff_iter\*(C'\fR in the
+same cases as FIRSTKEY and NEXTKEY.
+.PP
+The \f(CW\*(C`flags\*(C'\fR parameter can be used to determine which of these
+operations the callbacks should respond to. The following flags are
+currently defined:
+.PP
+Which Tie::Hash operation is being performed from the Perl level on
+\&\f(CW\*(C`%+\*(C'\fR or \f(CW\*(C`%\-\*(C'\fR, if any:
+.PP
+.Vb 8
+\& RXapif_FETCH
+\& RXapif_STORE
+\& RXapif_DELETE
+\& RXapif_CLEAR
+\& RXapif_EXISTS
+\& RXapif_SCALAR
+\& RXapif_FIRSTKEY
+\& RXapif_NEXTKEY
+.Ve
+.PP
+Whether \f(CW\*(C`%+\*(C'\fR or \f(CW\*(C`%\-\*(C'\fR is being operated on, if any:
+.PP
+.Vb 2
+\& RXapif_ONE /* %+ */
+\& RXapif_ALL /* %\- */
+.Ve
+.PP
+If this is being called as \f(CW\*(C`re::regname\*(C'\fR, \f(CW\*(C`re::regnames\*(C'\fR or
+\&\f(CW\*(C`re::regnames_count\*(C'\fR, if any. The first two will be combined with
+\&\f(CW\*(C`RXapif_ONE\*(C'\fR or \f(CW\*(C`RXapif_ALL\*(C'\fR.
+.PP
+.Vb 3
+\& RXapif_REGNAME
+\& RXapif_REGNAMES
+\& RXapif_REGNAMES_COUNT
+.Ve
+.PP
+Internally \f(CW\*(C`%+\*(C'\fR and \f(CW\*(C`%\-\*(C'\fR are implemented with a real tied interface
+via Tie::Hash::NamedCapture. The methods in that package will call
+back into these functions. However the usage of
+Tie::Hash::NamedCapture for this purpose might change in future
+releases. For instance this might be implemented by magic instead
+(would need an extension to mgvtbl).
+.PP
+\fInamed_buff\fR
+.IX Subsection "named_buff"
+.PP
+.Vb 2
+\& SV* (*named_buff) (pTHX_ REGEXP * const rx, SV * const key,
+\& SV * const value, U32 flags);
+.Ve
+.PP
+\fInamed_buff_iter\fR
+.IX Subsection "named_buff_iter"
+.PP
+.Vb 4
+\& SV* (*named_buff_iter) (pTHX_
+\& REGEXP * const rx,
+\& const SV * const lastkey,
+\& const U32 flags);
+.Ve
+.SS qr_package
+.IX Subsection "qr_package"
+.Vb 1
+\& SV* qr_package(pTHX_ REGEXP * const rx);
+.Ve
+.PP
+The package the qr// magic object is blessed into (as seen by \f(CW\*(C`ref
+qr//\*(C'\fR). It is recommended that engines change this to their package
+name for identification regardless of if they implement methods
+on the object.
+.PP
+The package this method returns should also have the internal
+\&\f(CW\*(C`Regexp\*(C'\fR package in its \f(CW@ISA\fR. \f(CW\*(C`qr//\->isa("Regexp")\*(C'\fR should always
+be true regardless of what engine is being used.
+.PP
+Example implementation might be:
+.PP
+.Vb 6
+\& SV*
+\& Example_qr_package(pTHX_ REGEXP * const rx)
+\& {
+\& PERL_UNUSED_ARG(rx);
+\& return newSVpvs("re::engine::Example");
+\& }
+.Ve
+.PP
+Any method calls on an object created with \f(CW\*(C`qr//\*(C'\fR will be dispatched to the
+package as a normal object.
+.PP
+.Vb 3
+\& use re::engine::Example;
+\& my $re = qr//;
+\& $re\->meth; # dispatched to re::engine::Example::meth()
+.Ve
+.PP
+To retrieve the \f(CW\*(C`REGEXP\*(C'\fR object from the scalar in an XS function use
+the \f(CW\*(C`SvRX\*(C'\fR macro, see "REGEXP Functions" in perlapi.
+.PP
+.Vb 3
+\& void meth(SV * rv)
+\& PPCODE:
+\& REGEXP * re = SvRX(rv);
+.Ve
+.SS dupe
+.IX Subsection "dupe"
+.Vb 1
+\& void* dupe(pTHX_ REGEXP * const rx, CLONE_PARAMS *param);
+.Ve
+.PP
+On threaded builds a regexp may need to be duplicated so that the pattern
+can be used by multiple threads. This routine is expected to handle the
+duplication of any private data pointed to by the \f(CW\*(C`pprivate\*(C'\fR member of
+the \f(CW\*(C`regexp\*(C'\fR structure. It will be called with the preconstructed new
+\&\f(CW\*(C`regexp\*(C'\fR structure as an argument, the \f(CW\*(C`pprivate\*(C'\fR member will point at
+the \fBold\fR private structure, and it is this routine's responsibility to
+construct a copy and return a pointer to it (which Perl will then use to
+overwrite the field as passed to this routine.)
+.PP
+This allows the engine to dupe its private data but also if necessary
+modify the final structure if it really must.
+.PP
+On unthreaded builds this field doesn't exist.
+.SS op_comp
+.IX Subsection "op_comp"
+This is private to the Perl core and subject to change. Should be left
+null.
+.SH "The REGEXP structure"
+.IX Header "The REGEXP structure"
+The REGEXP struct is defined in \fIregexp.h\fR.
+All regex engines must be able to
+correctly build such a structure in their "comp" routine.
+.PP
+The REGEXP structure contains all the data that Perl needs to be aware of
+to properly work with the regular expression. It includes data about
+optimisations that Perl can use to determine if the regex engine should
+really be used, and various other control info that is needed to properly
+execute patterns in various contexts, such as if the pattern is anchored in
+some way, or what flags were used during the compile, or if the
+program contains special constructs that Perl needs to be aware of.
+.PP
+In addition it contains two fields that are intended for the private
+use of the regex engine that compiled the pattern. These are the
+\&\f(CW\*(C`intflags\*(C'\fR and \f(CW\*(C`pprivate\*(C'\fR members. \f(CW\*(C`pprivate\*(C'\fR is a void pointer to
+an arbitrary structure, whose use and management is the responsibility
+of the compiling engine. Perl will never modify either of these
+values.
+.PP
+.Vb 3
+\& typedef struct regexp {
+\& /* what engine created this regexp? */
+\& const struct regexp_engine* engine;
+\&
+\& /* what re is this a lightweight copy of? */
+\& struct regexp* mother_re;
+\&
+\& /* Information about the match that the Perl core uses to manage
+\& * things */
+\& U32 extflags; /* Flags used both externally and internally */
+\& I32 minlen; /* minimum possible number of chars in
+\& string to match */
+\& I32 minlenret; /* minimum possible number of chars in $& */
+\& U32 gofs; /* chars left of pos that we search from */
+\&
+\& /* substring data about strings that must appear
+\& in the final match, used for optimisations */
+\& struct reg_substr_data *substrs;
+\&
+\& U32 nparens; /* number of capture groups */
+\&
+\& /* private engine specific data */
+\& U32 intflags; /* Engine Specific Internal flags */
+\& void *pprivate; /* Data private to the regex engine which
+\& created this object. */
+\&
+\& /* Data about the last/current match. These are modified during
+\& * matching*/
+\& U32 lastparen; /* highest close paren matched ($+) */
+\& U32 lastcloseparen; /* last close paren matched ($^N) */
+\& regexp_paren_pair *offs; /* Array of offsets for (@\-) and
+\& (@+) */
+\&
+\& char *subbeg; /* saved or original string so \edigit works
+\& forever. */
+\& SV_SAVED_COPY /* If non\-NULL, SV which is COW from original */
+\& I32 sublen; /* Length of string pointed by subbeg */
+\& I32 suboffset; /* byte offset of subbeg from logical start of
+\& str */
+\& I32 subcoffset; /* suboffset equiv, but in chars (for @\-/@+) */
+\&
+\& /* Information about the match that isn\*(Aqt often used */
+\& I32 prelen; /* length of precomp */
+\& const char *precomp; /* pre\-compilation regular expression */
+\&
+\& char *wrapped; /* wrapped version of the pattern */
+\& I32 wraplen; /* length of wrapped */
+\&
+\& I32 seen_evals; /* number of eval groups in the pattern \- for
+\& security checks */
+\& HV *paren_names; /* Optional hash of paren names */
+\&
+\& /* Refcount of this regexp */
+\& I32 refcnt; /* Refcount of this regexp */
+\& } regexp;
+.Ve
+.PP
+The fields are discussed in more detail below:
+.ie n .SS """engine"""
+.el .SS \f(CWengine\fP
+.IX Subsection "engine"
+This field points at a \f(CW\*(C`regexp_engine\*(C'\fR structure which contains pointers
+to the subroutines that are to be used for performing a match. It
+is the compiling routine's responsibility to populate this field before
+returning the regexp object.
+.PP
+Internally this is set to \f(CW\*(C`NULL\*(C'\fR unless a custom engine is specified in
+\&\f(CW$^H{regcomp}\fR, Perl's own set of callbacks can be accessed in the struct
+pointed to by \f(CW\*(C`RE_ENGINE_PTR\*(C'\fR.
+.ie n .SS """mother_re"""
+.el .SS \f(CWmother_re\fP
+.IX Subsection "mother_re"
+TODO, see commit 28d8d7f41a.
+.ie n .SS """extflags"""
+.el .SS \f(CWextflags\fP
+.IX Subsection "extflags"
+This will be used by Perl to see what flags the regexp was compiled
+with, this will normally be set to the value of the flags parameter by
+the comp callback. See the comp documentation for
+valid flags.
+.ie n .SS """minlen"" ""minlenret"""
+.el .SS "\f(CWminlen\fP \f(CWminlenret\fP"
+.IX Subsection "minlen minlenret"
+The minimum string length (in characters) required for the pattern to match.
+This is used to
+prune the search space by not bothering to match any closer to the end of a
+string than would allow a match. For instance there is no point in even
+starting the regex engine if the minlen is 10 but the string is only 5
+characters long. There is no way that the pattern can match.
+.PP
+\&\f(CW\*(C`minlenret\*(C'\fR is the minimum length (in characters) of the string that would
+be found in $& after a match.
+.PP
+The difference between \f(CW\*(C`minlen\*(C'\fR and \f(CW\*(C`minlenret\*(C'\fR can be seen in the
+following pattern:
+.PP
+.Vb 1
+\& /ns(?=\ed)/
+.Ve
+.PP
+where the \f(CW\*(C`minlen\*(C'\fR would be 3 but \f(CW\*(C`minlenret\*(C'\fR would only be 2 as the \ed is
+required to match but is not actually
+included in the matched content. This
+distinction is particularly important as the substitution logic uses the
+\&\f(CW\*(C`minlenret\*(C'\fR to tell if it can do in-place substitutions (these can
+result in considerable speed-up).
+.ie n .SS """gofs"""
+.el .SS \f(CWgofs\fP
+.IX Subsection "gofs"
+Left offset from \fBpos()\fR to start match at.
+.ie n .SS """substrs"""
+.el .SS \f(CWsubstrs\fP
+.IX Subsection "substrs"
+Substring data about strings that must appear in the final match. This
+is currently only used internally by Perl's engine, but might be
+used in the future for all engines for optimisations.
+.ie n .SS """nparens"", ""lastparen"", and ""lastcloseparen"""
+.el .SS "\f(CWnparens\fP, \f(CWlastparen\fP, and \f(CWlastcloseparen\fP"
+.IX Subsection "nparens, lastparen, and lastcloseparen"
+These fields are used to keep track of: how many paren capture groups
+there are in the pattern; which was the highest paren to be closed (see
+"$+" in perlvar); and which was the most recent paren to be closed (see
+"$^N" in perlvar).
+.ie n .SS """intflags"""
+.el .SS \f(CWintflags\fP
+.IX Subsection "intflags"
+The engine's private copy of the flags the pattern was compiled with. Usually
+this is the same as \f(CW\*(C`extflags\*(C'\fR unless the engine chose to modify one of them.
+.ie n .SS """pprivate"""
+.el .SS \f(CWpprivate\fP
+.IX Subsection "pprivate"
+A void* pointing to an engine-defined
+data structure. The Perl engine uses the
+\&\f(CW\*(C`regexp_internal\*(C'\fR structure (see "Base Structures" in perlreguts) but a custom
+engine should use something else.
+.ie n .SS """offs"""
+.el .SS \f(CWoffs\fP
+.IX Subsection "offs"
+A \f(CW\*(C`regexp_paren_pair\*(C'\fR structure which defines offsets into the string being
+matched which correspond to the \f(CW$&\fR and \f(CW$1\fR, \f(CW$2\fR etc. captures, the
+\&\f(CW\*(C`regexp_paren_pair\*(C'\fR struct is defined as follows:
+.PP
+.Vb 4
+\& typedef struct regexp_paren_pair {
+\& I32 start;
+\& I32 end;
+\& } regexp_paren_pair;
+.Ve
+.PP
+If \f(CW\*(C`\->offs[num].start\*(C'\fR or \f(CW\*(C`\->offs[num].end\*(C'\fR is \f(CW\-1\fR then that
+capture group did not match.
+\&\f(CW\*(C`\->offs[0].start/end\*(C'\fR represents \f(CW$&\fR (or
+\&\f(CW\*(C`${^MATCH}\*(C'\fR under \f(CW\*(C`/p\*(C'\fR) and \f(CW\*(C`\->offs[paren].end\*(C'\fR matches \f(CW$$paren\fR where
+\&\f(CW\*(C`$paren >= 1\*(C'\fR.
+.ie n .SS """precomp"" ""prelen"""
+.el .SS "\f(CWprecomp\fP \f(CWprelen\fP"
+.IX Subsection "precomp prelen"
+Used for optimisations. \f(CW\*(C`precomp\*(C'\fR holds a copy of the pattern that
+was compiled and \f(CW\*(C`prelen\*(C'\fR its length. When a new pattern is to be
+compiled (such as inside a loop) the internal \f(CW\*(C`regcomp\*(C'\fR operator
+checks if the last compiled \f(CW\*(C`REGEXP\*(C'\fR's \f(CW\*(C`precomp\*(C'\fR and \f(CW\*(C`prelen\*(C'\fR
+are equivalent to the new one, and if so uses the old pattern instead
+of compiling a new one.
+.PP
+The relevant snippet from \f(CW\*(C`Perl_pp_regcomp\*(C'\fR:
+.PP
+.Vb 3
+\& if (!re || !re\->precomp || re\->prelen != (I32)len ||
+\& memNE(re\->precomp, t, len))
+\& /* Compile a new pattern */
+.Ve
+.ie n .SS """paren_names"""
+.el .SS \f(CWparen_names\fP
+.IX Subsection "paren_names"
+This is a hash used internally to track named capture groups and their
+offsets. The keys are the names of the buffers; the values are dualvars,
+with the IV slot holding the number of buffers with the given name and the
+pv being an embedded array of I32. The values may also be contained
+independently in the data array in cases where named backreferences are
+used.
+.ie n .SS """substrs"""
+.el .SS \f(CWsubstrs\fP
+.IX Subsection "substrs"
+Holds information on the longest string that must occur at a fixed
+offset from the start of the pattern, and the longest string that must
+occur at a floating offset from the start of the pattern. Used to do
+Fast-Boyer-Moore searches on the string to find out if it's worth using
+the regex engine at all, and if so where in the string to search.
+.ie n .SS """subbeg"" ""sublen"" ""saved_copy"" ""suboffset"" ""subcoffset"""
+.el .SS "\f(CWsubbeg\fP \f(CWsublen\fP \f(CWsaved_copy\fP \f(CWsuboffset\fP \f(CWsubcoffset\fP"
+.IX Subsection "subbeg sublen saved_copy suboffset subcoffset"
+Used during the execution phase for managing search and replace patterns,
+and for providing the text for \f(CW$&\fR, \f(CW$1\fR etc. \f(CW\*(C`subbeg\*(C'\fR points to a
+buffer (either the original string, or a copy in the case of
+\&\f(CWRX_MATCH_COPIED(rx)\fR), and \f(CW\*(C`sublen\*(C'\fR is the length of the buffer. The
+\&\f(CW\*(C`RX_OFFS\*(C'\fR start and end indices index into this buffer.
+.PP
+In the presence of the \f(CW\*(C`REXEC_COPY_STR\*(C'\fR flag, but with the addition of
+the \f(CW\*(C`REXEC_COPY_SKIP_PRE\*(C'\fR or \f(CW\*(C`REXEC_COPY_SKIP_POST\*(C'\fR flags, an engine
+can choose not to copy the full buffer (although it must still do so in
+the presence of \f(CW\*(C`RXf_PMf_KEEPCOPY\*(C'\fR or the relevant bits being set in
+\&\f(CW\*(C`PL_sawampersand\*(C'\fR). In this case, it may set \f(CW\*(C`suboffset\*(C'\fR to indicate the
+number of bytes from the logical start of the buffer to the physical start
+(i.e. \f(CW\*(C`subbeg\*(C'\fR). It should also set \f(CW\*(C`subcoffset\*(C'\fR, the number of
+characters in the offset. The latter is needed to support \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR
+which work in characters, not bytes.
+.ie n .SS """wrapped"" ""wraplen"""
+.el .SS "\f(CWwrapped\fP \f(CWwraplen\fP"
+.IX Subsection "wrapped wraplen"
+Stores the string \f(CW\*(C`qr//\*(C'\fR stringifies to. The Perl engine for example
+stores \f(CW\*(C`(?^:eek)\*(C'\fR in the case of \f(CW\*(C`qr/eek/\*(C'\fR.
+.PP
+When using a custom engine that doesn't support the \f(CW\*(C`(?:)\*(C'\fR construct
+for inline modifiers, it's probably best to have \f(CW\*(C`qr//\*(C'\fR stringify to
+the supplied pattern. Note that this will create undesired patterns in
+cases such as:
+.PP
+.Vb 3
+\& my $x = qr/a|b/; # "a|b"
+\& my $y = qr/c/i; # "c"
+\& my $z = qr/$x$y/; # "a|bc"
+.Ve
+.PP
+There's no solution for this problem other than making the custom
+engine understand a construct like \f(CW\*(C`(?:)\*(C'\fR.
+.ie n .SS """seen_evals"""
+.el .SS \f(CWseen_evals\fP
+.IX Subsection "seen_evals"
+This stores the number of eval groups in
+the pattern. This is used for security
+purposes when embedding compiled regexes into larger patterns with \f(CW\*(C`qr//\*(C'\fR.
+.ie n .SS """refcnt"""
+.el .SS \f(CWrefcnt\fP
+.IX Subsection "refcnt"
+The number of times the structure is referenced. When
+this falls to 0, the regexp is automatically freed
+by a call to \f(CW\*(C`pregfree\*(C'\fR. This should be set to 1 in
+each engine's "comp" routine.
+.SH HISTORY
+.IX Header "HISTORY"
+Originally part of perlreguts.
+.SH AUTHORS
+.IX Header "AUTHORS"
+Originally written by Yves Orton, expanded by Ævar Arnfjörð
+Bjarmason.
+.SH LICENSE
+.IX Header "LICENSE"
+Copyright 2006 Yves Orton and 2007 Ævar Arnfjörð Bjarmason.
+.PP
+This program is free software; you can redistribute it and/or modify it under
+the same terms as Perl itself.