diff options
Diffstat (limited to 'upstream/fedora-40/man1/perlreapi.1')
-rw-r--r-- | upstream/fedora-40/man1/perlreapi.1 | 925 |
1 files changed, 925 insertions, 0 deletions
diff --git a/upstream/fedora-40/man1/perlreapi.1 b/upstream/fedora-40/man1/perlreapi.1 new file mode 100644 index 00000000..05dee3ac --- /dev/null +++ b/upstream/fedora-40/man1/perlreapi.1 @@ -0,0 +1,925 @@ +.\" -*- mode: troff; coding: utf-8 -*- +.\" Automatically generated by Pod::Man 5.01 (Pod::Simple 3.43) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" \*(C` and \*(C' are quotes in nroff, nothing in troff, for use with C<>. +.ie n \{\ +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds C` +. ds C' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is >0, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.\" +.\" Avoid warning from groff about undefined register 'F'. +.de IX +.. +.nr rF 0 +.if \n(.g .if rF .nr rF 1 +.if (\n(rF:(\n(.g==0)) \{\ +. if \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. if !\nF==2 \{\ +. nr % 0 +. nr F 2 +. \} +. \} +.\} +.rr rF +.\" ======================================================================== +.\" +.IX Title "PERLREAPI 1" +.TH PERLREAPI 1 2024-01-25 "perl v5.38.2" "Perl Programmers Reference Guide" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. +.if n .ad l +.nh +.SH NAME +perlreapi \- Perl regular expression plugin interface +.SH DESCRIPTION +.IX Header "DESCRIPTION" +As of Perl 5.9.5 there is a new interface for plugging and using +regular expression engines other than the default one. 
+.PP +Each engine is supposed to provide access to a constant structure of the +following format: +.PP +.Vb 10 +\& typedef struct regexp_engine { +\& REGEXP* (*comp) (pTHX_ +\& const SV * const pattern, const U32 flags); +\& I32 (*exec) (pTHX_ +\& REGEXP * const rx, +\& char* stringarg, +\& char* strend, char* strbeg, +\& SSize_t minend, SV* sv, +\& void* data, U32 flags); +\& char* (*intuit) (pTHX_ +\& REGEXP * const rx, SV *sv, +\& const char * const strbeg, +\& char *strpos, char *strend, U32 flags, +\& struct re_scream_pos_data_s *data); +\& SV* (*checkstr) (pTHX_ REGEXP * const rx); +\& void (*free) (pTHX_ REGEXP * const rx); +\& void (*numbered_buff_FETCH) (pTHX_ +\& REGEXP * const rx, +\& const I32 paren, +\& SV * const sv); +\& void (*numbered_buff_STORE) (pTHX_ +\& REGEXP * const rx, +\& const I32 paren, +\& SV const * const value); +\& I32 (*numbered_buff_LENGTH) (pTHX_ +\& REGEXP * const rx, +\& const SV * const sv, +\& const I32 paren); +\& SV* (*named_buff) (pTHX_ +\& REGEXP * const rx, +\& SV * const key, +\& SV * const value, +\& U32 flags); +\& SV* (*named_buff_iter) (pTHX_ +\& REGEXP * const rx, +\& const SV * const lastkey, +\& const U32 flags); +\& SV* (*qr_package)(pTHX_ REGEXP * const rx); +\& #ifdef USE_ITHREADS +\& void* (*dupe) (pTHX_ REGEXP * const rx, CLONE_PARAMS *param); +\& #endif +\& REGEXP* (*op_comp) (...); +.Ve +.PP +When a regexp is compiled, its \f(CW\*(C`engine\*(C'\fR field is then set to point at +the appropriate structure, so that when it needs to be used Perl can find +the right routines to do so. +.PP +In order to install a new regexp handler, \f(CW$^H{regcomp}\fR is set +to an integer which (when casted appropriately) resolves to one of these +structures. When compiling, the \f(CW\*(C`comp\*(C'\fR method is executed, and the +resulting \f(CW\*(C`regexp\*(C'\fR structure's engine field is expected to point back at +the same structure. 
+.PP +The pTHX_ symbol in the definition is a macro used by Perl under threading +to provide an extra argument to the routine holding a pointer back to +the interpreter that is executing the regexp. So under threading all +routines get an extra argument. +.SH Callbacks +.IX Header "Callbacks" +.SS comp +.IX Subsection "comp" +.Vb 1 +\& REGEXP* comp(pTHX_ const SV * const pattern, const U32 flags); +.Ve +.PP +Compile the pattern stored in \f(CW\*(C`pattern\*(C'\fR using the given \f(CW\*(C`flags\*(C'\fR and +return a pointer to a prepared \f(CW\*(C`REGEXP\*(C'\fR structure that can perform +the match. See "The REGEXP structure" below for an explanation of +the individual fields in the REGEXP struct. +.PP +The \f(CW\*(C`pattern\*(C'\fR parameter is the scalar that was used as the +pattern. Previous versions of Perl would pass two \f(CW\*(C`char*\*(C'\fR indicating +the start and end of the stringified pattern; the following snippet can +be used to get the old parameters: +.PP +.Vb 3 +\& STRLEN plen; +\& char* exp = SvPV(pattern, plen); +\& char* xend = exp + plen; +.Ve +.PP +Since any scalar can be passed as a pattern, it's possible to implement +an engine that does something with an array (\f(CW\*(C`"ook" =~ [ qw/ eek +hlagh / ]\*(C'\fR) or with the non-stringified form of a compiled regular +expression (\f(CW\*(C`"ook" =~ qr/eek/\*(C'\fR). Perl's own engine will always +stringify everything using the snippet above, but that doesn't mean +other engines have to. +.PP +The \f(CW\*(C`flags\*(C'\fR parameter is a bitfield which indicates which of the +\&\f(CW\*(C`msixpn\*(C'\fR flags the regex was compiled with. It also contains +additional info, such as if \f(CW\*(C`use locale\*(C'\fR is in effect. +.PP +The \f(CW\*(C`eogc\*(C'\fR flags are stripped out before being passed to the comp +routine. 
The regex engine does not need to know if any of these +are set, as those flags should only affect what Perl does with the +pattern and its match variables, not how it gets compiled and +executed. +.PP +By the time the comp callback is called, some of these flags have +already had effect (noted below where applicable). However most of +their effect occurs after the comp callback has run, in routines that +read the \f(CW\*(C`rx\->extflags\*(C'\fR field which it populates. +.PP +In general the flags should be preserved in \f(CW\*(C`rx\->extflags\*(C'\fR after +compilation, although the regex engine might want to add or delete +some of them to invoke or disable some special behavior in Perl. The +flags along with any special behavior they cause are documented below: +.PP +The pattern modifiers: +.ie n .IP """/m"" \- RXf_PMf_MULTILINE" 4 +.el .IP "\f(CW/m\fR \- RXf_PMf_MULTILINE" 4 +.IX Item "/m - RXf_PMf_MULTILINE" +If this is in \f(CW\*(C`rx\->extflags\*(C'\fR it will be passed to +\&\f(CW\*(C`Perl_fbm_instr\*(C'\fR by \f(CW\*(C`pp_split\*(C'\fR which will treat the subject string +as a multi-line string. +.ie n .IP """/s"" \- RXf_PMf_SINGLELINE" 4 +.el .IP "\f(CW/s\fR \- RXf_PMf_SINGLELINE" 4 +.IX Item "/s - RXf_PMf_SINGLELINE" +.PD 0 +.ie n .IP """/i"" \- RXf_PMf_FOLD" 4 +.el .IP "\f(CW/i\fR \- RXf_PMf_FOLD" 4 +.IX Item "/i - RXf_PMf_FOLD" +.ie n .IP """/x"" \- RXf_PMf_EXTENDED" 4 +.el .IP "\f(CW/x\fR \- RXf_PMf_EXTENDED" 4 +.IX Item "/x - RXf_PMf_EXTENDED" +.PD +If present on a regex, \f(CW"#"\fR comments will be handled differently by the +tokenizer in some cases. +.Sp +TODO: Document those cases. +.ie n .IP """/p"" \- RXf_PMf_KEEPCOPY" 4 +.el .IP "\f(CW/p\fR \- RXf_PMf_KEEPCOPY" 4 +.IX Item "/p - RXf_PMf_KEEPCOPY" +TODO: Document this +.IP "Character set" 4 +.IX Item "Character set" +The character set rules are determined by an enum that is contained +in this field. 
This is still experimental and subject to change, but +the current interface returns the rules by use of the in-line function +\&\f(CW\*(C`get_regex_charset(const U32 flags)\*(C'\fR. The only currently documented +value returned from it is REGEX_LOCALE_CHARSET, which is set if +\&\f(CW\*(C`use locale\*(C'\fR is in effect. If present in \f(CW\*(C`rx\->extflags\*(C'\fR, +\&\f(CW\*(C`split\*(C'\fR will use the locale dependent definition of whitespace +when RXf_SKIPWHITE or RXf_WHITE is in effect. ASCII whitespace +is defined as per isSPACE, and by the internal +macros \f(CW\*(C`is_utf8_space\*(C'\fR under UTF\-8, and \f(CW\*(C`isSPACE_LC\*(C'\fR under \f(CW\*(C`use +locale\*(C'\fR. +.PP +Additional flags: +.IP RXf_SPLIT 4 +.IX Item "RXf_SPLIT" +This flag was removed in perl 5.18.0. \f(CW\*(C`split \*(Aq \*(Aq\*(C'\fR is now special-cased +solely in the parser. RXf_SPLIT is still #defined, so you can test for it. +This is how it used to work: +.Sp +If \f(CW\*(C`split\*(C'\fR is invoked as \f(CW\*(C`split \*(Aq \*(Aq\*(C'\fR or with no arguments (which +really means \f(CW\*(C`split(\*(Aq \*(Aq, $_)\*(C'\fR, see split), Perl will +set this flag. The regex engine can then check for it and set the +SKIPWHITE and WHITE extflags. To do this, the Perl engine does: +.Sp +.Vb 2 +\& if (flags & RXf_SPLIT && r\->prelen == 1 && r\->precomp[0] == \*(Aq \*(Aq) +\& r\->extflags |= (RXf_SKIPWHITE|RXf_WHITE); +.Ve +.PP +These flags can be set during compilation to enable optimizations in +the \f(CW\*(C`split\*(C'\fR operator. +.IP RXf_SKIPWHITE 4 +.IX Item "RXf_SKIPWHITE" +This flag was removed in perl 5.18.0. It is still #defined, so you can +set it, but doing so will have no effect. This is how it used to work: +.Sp +If the flag is present in \f(CW\*(C`rx\->extflags\*(C'\fR \f(CW\*(C`split\*(C'\fR will delete +whitespace from the start of the subject string before it's operated +on. 
What is considered whitespace depends on if the subject is a +UTF\-8 string and if the \f(CW\*(C`RXf_PMf_LOCALE\*(C'\fR flag is set. +.Sp +If RXf_WHITE is set in addition to this flag, \f(CW\*(C`split\*(C'\fR will behave like +\&\f(CW\*(C`split " "\*(C'\fR under the Perl engine. +.IP RXf_START_ONLY 4 +.IX Item "RXf_START_ONLY" +Tells the split operator to split the target string on newlines +(\f(CW\*(C`\en\*(C'\fR) without invoking the regex engine. +.Sp +Perl's engine sets this if the pattern is \f(CW\*(C`/^/\*(C'\fR (\f(CW\*(C`plen == 1 && *exp +== \*(Aq^\*(Aq\*(C'\fR), even under \f(CW\*(C`/^/s\*(C'\fR; see split. Of course a +different regex engine might want to use the same optimizations +with a different syntax. +.IP RXf_WHITE 4 +.IX Item "RXf_WHITE" +Tells the split operator to split the target string on whitespace +without invoking the regex engine. The definition of whitespace varies +depending on if the target string is a UTF\-8 string and on +if RXf_PMf_LOCALE is set. +.Sp +Perl's engine sets this flag if the pattern is \f(CW\*(C`\es+\*(C'\fR. +.IP RXf_NULL 4 +.IX Item "RXf_NULL" +Tells the split operator to split the target string on +characters. The definition of character varies depending on if +the target string is a UTF\-8 string. +.Sp +Perl's engine sets this flag on empty patterns, this optimization +makes \f(CW\*(C`split //\*(C'\fR much faster than it would otherwise be. It's even +faster than \f(CW\*(C`unpack\*(C'\fR. +.IP RXf_NO_INPLACE_SUBST 4 +.IX Item "RXf_NO_INPLACE_SUBST" +Added in perl 5.18.0, this flag indicates that a regular expression might +perform an operation that would interfere with inplace substitution. For +instance it might contain lookbehind, or assign to non-magical variables +(such as \f(CW$REGMARK\fR and \f(CW$REGERROR\fR) during matching. \f(CW\*(C`s///\*(C'\fR will skip +certain optimisations when this is set. 
+.SS exec +.IX Subsection "exec" +.Vb 4 +\& I32 exec(pTHX_ REGEXP * const rx, +\& char *stringarg, char* strend, char* strbeg, +\& SSize_t minend, SV* sv, +\& void* data, U32 flags); +.Ve +.PP +Execute a regexp. The arguments are +.IP rx 4 +.IX Item "rx" +The regular expression to execute. +.IP sv 4 +.IX Item "sv" +This is the SV to be matched against. Note that the +actual char array to be matched against is supplied by the arguments +described below; the SV is just used to determine UTF8ness, \f(CWpos()\fR etc. +.IP strbeg 4 +.IX Item "strbeg" +Pointer to the physical start of the string. +.IP strend 4 +.IX Item "strend" +Pointer to the character following the physical end of the string (i.e. +the \f(CW\*(C`\e0\*(C'\fR, if any). +.IP stringarg 4 +.IX Item "stringarg" +Pointer to the position in the string where matching should start; it might +not be equal to \f(CW\*(C`strbeg\*(C'\fR (for example in a later iteration of \f(CW\*(C`/.../g\*(C'\fR). +.IP minend 4 +.IX Item "minend" +Minimum length of string (measured in bytes from \f(CW\*(C`stringarg\*(C'\fR) that must +match; if the engine reaches the end of the match but hasn't reached this +position in the string, it should fail. +.IP data 4 +.IX Item "data" +Optimisation data; subject to change. +.IP flags 4 +.IX Item "flags" +Optimisation flags; subject to change. +.SS intuit +.IX Subsection "intuit" +.Vb 8 +\& char* intuit(pTHX_ +\& REGEXP * const rx, +\& SV *sv, +\& const char * const strbeg, +\& char *strpos, +\& char *strend, +\& const U32 flags, +\& struct re_scream_pos_data_s *data); +.Ve +.PP +Find the start position where a regex match should be attempted, +or possibly if the regex engine should not be run because the +pattern can't match. This is called, as appropriate, by the core, +depending on the values of the \f(CW\*(C`extflags\*(C'\fR member of the \f(CW\*(C`regexp\*(C'\fR +structure. 
+.PP +Arguments: +.PP +.Vb 11 +\& rx: the regex to match against +\& sv: the SV being matched: only used for utf8 flag; the string +\& itself is accessed via the pointers below. Note that on +\& something like an overloaded SV, SvPOK(sv) may be false +\& and the string pointers may point to something unrelated to +\& the SV itself. +\& strbeg: real beginning of string +\& strpos: the point in the string at which to begin matching +\& strend: pointer to the byte following the last char of the string +\& flags currently unused; set to 0 +\& data: currently unused; set to NULL +.Ve +.SS checkstr +.IX Subsection "checkstr" +.Vb 1 +\& SV* checkstr(pTHX_ REGEXP * const rx); +.Ve +.PP +Return a SV containing a string that must appear in the pattern. Used +by \f(CW\*(C`split\*(C'\fR for optimising matches. +.SS free +.IX Subsection "free" +.Vb 1 +\& void free(pTHX_ REGEXP * const rx); +.Ve +.PP +Called by Perl when it is freeing a regexp pattern so that the engine +can release any resources pointed to by the \f(CW\*(C`pprivate\*(C'\fR member of the +\&\f(CW\*(C`regexp\*(C'\fR structure. This is only responsible for freeing private data; +Perl will handle releasing anything else contained in the \f(CW\*(C`regexp\*(C'\fR structure. +.SS "Numbered capture callbacks" +.IX Subsection "Numbered capture callbacks" +Called to get/set the value of \f(CW\*(C`$\`\*(C'\fR, \f(CW\*(C`$\*(Aq\*(C'\fR, \f(CW$&\fR and their named +equivalents, ${^PREMATCH}, ${^POSTMATCH} and ${^MATCH}, as well as the +numbered capture groups (\f(CW$1\fR, \f(CW$2\fR, ...). 
+.PP +The \f(CW\*(C`paren\*(C'\fR parameter will be \f(CW1\fR for \f(CW$1\fR, \f(CW2\fR for \f(CW$2\fR and so +forth, and have these symbolic values for the special variables: +.PP +.Vb 6 +\& ${^PREMATCH} RX_BUFF_IDX_CARET_PREMATCH +\& ${^POSTMATCH} RX_BUFF_IDX_CARET_POSTMATCH +\& ${^MATCH} RX_BUFF_IDX_CARET_FULLMATCH +\& $\` RX_BUFF_IDX_PREMATCH +\& $\*(Aq RX_BUFF_IDX_POSTMATCH +\& $& RX_BUFF_IDX_FULLMATCH +.Ve +.PP +Note that in Perl 5.17.3 and earlier, the last three constants were also +used for the caret variants of the variables. +.PP +The names have been chosen by analogy with Tie::Scalar methods +names with an additional \fBLENGTH\fR callback for efficiency. However +named capture variables are currently not tied internally but +implemented via magic. +.PP +\fInumbered_buff_FETCH\fR +.IX Subsection "numbered_buff_FETCH" +.PP +.Vb 2 +\& void numbered_buff_FETCH(pTHX_ REGEXP * const rx, const I32 paren, +\& SV * const sv); +.Ve +.PP +Fetch a specified numbered capture. \f(CW\*(C`sv\*(C'\fR should be set to the scalar +to return, the scalar is passed as an argument rather than being +returned from the function because when it's called Perl already has a +scalar to store the value, creating another one would be +redundant. The scalar can be set with \f(CW\*(C`sv_setsv\*(C'\fR, \f(CW\*(C`sv_setpvn\*(C'\fR and +friends, see perlapi. +.PP +This callback is where Perl untaints its own capture variables under +taint mode (see perlsec). See the \f(CW\*(C`Perl_reg_numbered_buff_fetch\*(C'\fR +function in \fIregcomp.c\fR for how to untaint capture variables if +that's something you'd like your engine to do as well. +.PP +\fInumbered_buff_STORE\fR +.IX Subsection "numbered_buff_STORE" +.PP +.Vb 4 +\& void (*numbered_buff_STORE) (pTHX_ +\& REGEXP * const rx, +\& const I32 paren, +\& SV const * const value); +.Ve +.PP +Set the value of a numbered capture variable. \f(CW\*(C`value\*(C'\fR is the scalar +that is to be used as the new value. 
It's up to the engine to make +sure this is used as the new value (or reject it). +.PP +Example: +.PP +.Vb 4 +\& if ("ook" =~ /(o*)/) { +\& # \*(Aqparen\*(Aq will be \*(Aq1\*(Aq and \*(Aqvalue\*(Aq will be \*(Aqee\*(Aq +\& $1 =~ tr/o/e/; +\& } +.Ve +.PP +Perl's own engine will croak on any attempt to modify the capture +variables, to do this in another engine use the following callback +(copied from \f(CW\*(C`Perl_reg_numbered_buff_store\*(C'\fR): +.PP +.Vb 9 +\& void +\& Example_reg_numbered_buff_store(pTHX_ +\& REGEXP * const rx, +\& const I32 paren, +\& SV const * const value) +\& { +\& PERL_UNUSED_ARG(rx); +\& PERL_UNUSED_ARG(paren); +\& PERL_UNUSED_ARG(value); +\& +\& if (!PL_localizing) +\& Perl_croak(aTHX_ PL_no_modify); +\& } +.Ve +.PP +Actually Perl will not \fIalways\fR croak in a statement that looks +like it would modify a numbered capture variable. This is because the +STORE callback will not be called if Perl can determine that it +doesn't have to modify the value. This is exactly how tied variables +behave in the same situation: +.PP +.Vb 2 +\& package CaptureVar; +\& use parent \*(AqTie::Scalar\*(Aq; +\& +\& sub TIESCALAR { bless [] } +\& sub FETCH { undef } +\& sub STORE { die "This doesn\*(Aqt get called" } +\& +\& package main; +\& +\& tie my $sv => "CaptureVar"; +\& $sv =~ y/a/b/; +.Ve +.PP +Because \f(CW$sv\fR is \f(CW\*(C`undef\*(C'\fR when the \f(CW\*(C`y///\*(C'\fR operator is applied to it, +the transliteration won't actually execute and the program won't +\&\f(CW\*(C`die\*(C'\fR. This is different to how 5.8 and earlier versions behaved +since the capture variables were READONLY variables then; now they'll +just die when assigned to in the default engine. +.PP +\fInumbered_buff_LENGTH\fR +.IX Subsection "numbered_buff_LENGTH" +.PP +.Vb 4 +\& I32 numbered_buff_LENGTH (pTHX_ +\& REGEXP * const rx, +\& const SV * const sv, +\& const I32 paren); +.Ve +.PP +Get the \f(CW\*(C`length\*(C'\fR of a capture variable. 
There's a special callback +for this so that Perl doesn't have to do a FETCH and run \f(CW\*(C`length\*(C'\fR on +the result, since the length is (in Perl's case) known from an offset +stored in \f(CW\*(C`rx\->offs\*(C'\fR, this is much more efficient: +.PP +.Vb 3 +\& I32 s1 = rx\->offs[paren].start; +\& I32 s2 = rx\->offs[paren].end; +\& I32 len = s2 \- s1; +.Ve +.PP +This is a little bit more complex in the case of UTF\-8, see what +\&\f(CW\*(C`Perl_reg_numbered_buff_length\*(C'\fR does with +is_utf8_string_loclen. +.SS "Named capture callbacks" +.IX Subsection "Named capture callbacks" +Called to get/set the value of \f(CW\*(C`%+\*(C'\fR and \f(CW\*(C`%\-\*(C'\fR, as well as by some +utility functions in re. +.PP +There are two callbacks, \f(CW\*(C`named_buff\*(C'\fR is called in all the cases the +FETCH, STORE, DELETE, CLEAR, EXISTS and SCALAR Tie::Hash callbacks +would be on changes to \f(CW\*(C`%+\*(C'\fR and \f(CW\*(C`%\-\*(C'\fR and \f(CW\*(C`named_buff_iter\*(C'\fR in the +same cases as FIRSTKEY and NEXTKEY. +.PP +The \f(CW\*(C`flags\*(C'\fR parameter can be used to determine which of these +operations the callbacks should respond to. The following flags are +currently defined: +.PP +Which Tie::Hash operation is being performed from the Perl level on +\&\f(CW\*(C`%+\*(C'\fR or \f(CW\*(C`%\-\*(C'\fR, if any: +.PP +.Vb 8 +\& RXapif_FETCH +\& RXapif_STORE +\& RXapif_DELETE +\& RXapif_CLEAR +\& RXapif_EXISTS +\& RXapif_SCALAR +\& RXapif_FIRSTKEY +\& RXapif_NEXTKEY +.Ve +.PP +If \f(CW\*(C`%+\*(C'\fR or \f(CW\*(C`%\-\*(C'\fR is being operated on, if any. +.PP +.Vb 2 +\& RXapif_ONE /* %+ */ +\& RXapif_ALL /* %\- */ +.Ve +.PP +If this is being called as \f(CW\*(C`re::regname\*(C'\fR, \f(CW\*(C`re::regnames\*(C'\fR or +\&\f(CW\*(C`re::regnames_count\*(C'\fR, if any. The first two will be combined with +\&\f(CW\*(C`RXapif_ONE\*(C'\fR or \f(CW\*(C`RXapif_ALL\*(C'\fR. 
+.PP +.Vb 3 +\& RXapif_REGNAME +\& RXapif_REGNAMES +\& RXapif_REGNAMES_COUNT +.Ve +.PP +Internally \f(CW\*(C`%+\*(C'\fR and \f(CW\*(C`%\-\*(C'\fR are implemented with a real tied interface +via Tie::Hash::NamedCapture. The methods in that package will call +back into these functions. However the usage of +Tie::Hash::NamedCapture for this purpose might change in future +releases. For instance this might be implemented by magic instead +(would need an extension to mgvtbl). +.PP +\fInamed_buff\fR +.IX Subsection "named_buff" +.PP +.Vb 2 +\& SV* (*named_buff) (pTHX_ REGEXP * const rx, SV * const key, +\& SV * const value, U32 flags); +.Ve +.PP +\fInamed_buff_iter\fR +.IX Subsection "named_buff_iter" +.PP +.Vb 4 +\& SV* (*named_buff_iter) (pTHX_ +\& REGEXP * const rx, +\& const SV * const lastkey, +\& const U32 flags); +.Ve +.SS qr_package +.IX Subsection "qr_package" +.Vb 1 +\& SV* qr_package(pTHX_ REGEXP * const rx); +.Ve +.PP +The package the qr// magic object is blessed into (as seen by \f(CW\*(C`ref +qr//\*(C'\fR). It is recommended that engines change this to their package +name for identification regardless of if they implement methods +on the object. +.PP +The package this method returns should also have the internal +\&\f(CW\*(C`Regexp\*(C'\fR package in its \f(CW@ISA\fR. \f(CW\*(C`qr//\->isa("Regexp")\*(C'\fR should always +be true regardless of what engine is being used. +.PP +Example implementation might be: +.PP +.Vb 6 +\& SV* +\& Example_qr_package(pTHX_ REGEXP * const rx) +\& { +\& PERL_UNUSED_ARG(rx); +\& return newSVpvs("re::engine::Example"); +\& } +.Ve +.PP +Any method calls on an object created with \f(CW\*(C`qr//\*(C'\fR will be dispatched to the +package as a normal object. 
+.PP +.Vb 3 +\& use re::engine::Example; +\& my $re = qr//; +\& $re\->meth; # dispatched to re::engine::Example::meth() +.Ve +.PP +To retrieve the \f(CW\*(C`REGEXP\*(C'\fR object from the scalar in an XS function use +the \f(CW\*(C`SvRX\*(C'\fR macro, see "REGEXP Functions" in perlapi. +.PP +.Vb 3 +\& void meth(SV * rv) +\& PPCODE: +\& REGEXP * re = SvRX(rv); +.Ve +.SS dupe +.IX Subsection "dupe" +.Vb 1 +\& void* dupe(pTHX_ REGEXP * const rx, CLONE_PARAMS *param); +.Ve +.PP +On threaded builds a regexp may need to be duplicated so that the pattern +can be used by multiple threads. This routine is expected to handle the +duplication of any private data pointed to by the \f(CW\*(C`pprivate\*(C'\fR member of +the \f(CW\*(C`regexp\*(C'\fR structure. It will be called with the preconstructed new +\&\f(CW\*(C`regexp\*(C'\fR structure as an argument, the \f(CW\*(C`pprivate\*(C'\fR member will point at +the \fBold\fR private structure, and it is this routine's responsibility to +construct a copy and return a pointer to it (which Perl will then use to +overwrite the field as passed to this routine.) +.PP +This allows the engine to dupe its private data but also if necessary +modify the final structure if it really must. +.PP +On unthreaded builds this field doesn't exist. +.SS op_comp +.IX Subsection "op_comp" +This is private to the Perl core and subject to change. Should be left +null. +.SH "The REGEXP structure" +.IX Header "The REGEXP structure" +The REGEXP struct is defined in \fIregexp.h\fR. +All regex engines must be able to +correctly build such a structure in their "comp" routine. +.PP +The REGEXP structure contains all the data that Perl needs to be aware of +to properly work with the regular expression. 
It includes data about +optimisations that Perl can use to determine if the regex engine should +really be used, and various other control info that is needed to properly +execute patterns in various contexts, such as if the pattern is anchored in +some way, or what flags were used during the compile, or if the +program contains special constructs that Perl needs to be aware of. +.PP +In addition it contains two fields that are intended for the private +use of the regex engine that compiled the pattern. These are the +\&\f(CW\*(C`intflags\*(C'\fR and \f(CW\*(C`pprivate\*(C'\fR members. \f(CW\*(C`pprivate\*(C'\fR is a void pointer to +an arbitrary structure, whose use and management is the responsibility +of the compiling engine. Perl will never modify either of these +values. +.PP +.Vb 3 +\& typedef struct regexp { +\& /* what engine created this regexp? */ +\& const struct regexp_engine* engine; +\& +\& /* what re is this a lightweight copy of? */ +\& struct regexp* mother_re; +\& +\& /* Information about the match that the Perl core uses to manage +\& * things */ +\& U32 extflags; /* Flags used both externally and internally */ +\& I32 minlen; /* minimum possible number of chars in +\& string to match */ +\& I32 minlenret; /* minimum possible number of chars in $& */ +\& U32 gofs; /* chars left of pos that we search from */ +\& +\& /* substring data about strings that must appear +\& in the final match, used for optimisations */ +\& struct reg_substr_data *substrs; +\& +\& U32 nparens; /* number of capture groups */ +\& +\& /* private engine specific data */ +\& U32 intflags; /* Engine Specific Internal flags */ +\& void *pprivate; /* Data private to the regex engine which +\& created this object. */ +\& +\& /* Data about the last/current match. 
These are modified during +\& * matching*/ +\& U32 lastparen; /* highest close paren matched ($+) */ +\& U32 lastcloseparen; /* last close paren matched ($^N) */ +\& regexp_paren_pair *offs; /* Array of offsets for (@\-) and +\& (@+) */ +\& +\& char *subbeg; /* saved or original string so \edigit works +\& forever. */ +\& SV_SAVED_COPY /* If non\-NULL, SV which is COW from original */ +\& I32 sublen; /* Length of string pointed by subbeg */ +\& I32 suboffset; /* byte offset of subbeg from logical start of +\& str */ +\& I32 subcoffset; /* suboffset equiv, but in chars (for @\-/@+) */ +\& +\& /* Information about the match that isn\*(Aqt often used */ +\& I32 prelen; /* length of precomp */ +\& const char *precomp; /* pre\-compilation regular expression */ +\& +\& char *wrapped; /* wrapped version of the pattern */ +\& I32 wraplen; /* length of wrapped */ +\& +\& I32 seen_evals; /* number of eval groups in the pattern \- for +\& security checks */ +\& HV *paren_names; /* Optional hash of paren names */ +\& +\& /* Refcount of this regexp */ +\& I32 refcnt; /* Refcount of this regexp */ +\& } regexp; +.Ve +.PP +The fields are discussed in more detail below: +.ie n .SS """engine""" +.el .SS \f(CWengine\fP +.IX Subsection "engine" +This field points at a \f(CW\*(C`regexp_engine\*(C'\fR structure which contains pointers +to the subroutines that are to be used for performing a match. It +is the compiling routine's responsibility to populate this field before +returning the regexp object. +.PP +Internally this is set to \f(CW\*(C`NULL\*(C'\fR unless a custom engine is specified in +\&\f(CW$^H{regcomp}\fR, Perl's own set of callbacks can be accessed in the struct +pointed to by \f(CW\*(C`RE_ENGINE_PTR\*(C'\fR. +.ie n .SS """mother_re""" +.el .SS \f(CWmother_re\fP +.IX Subsection "mother_re" +TODO, see commit 28d8d7f41a. 
+.ie n .SS """extflags""" +.el .SS \f(CWextflags\fP +.IX Subsection "extflags" +This will be used by Perl to see what flags the regexp was compiled +with, this will normally be set to the value of the flags parameter by +the comp callback. See the comp documentation for +valid flags. +.ie n .SS """minlen"" ""minlenret""" +.el .SS "\f(CWminlen\fP \f(CWminlenret\fP" +.IX Subsection "minlen minlenret" +The minimum string length (in characters) required for the pattern to match. +This is used to +prune the search space by not bothering to match any closer to the end of a +string than would allow a match. For instance there is no point in even +starting the regex engine if the minlen is 10 but the string is only 5 +characters long. There is no way that the pattern can match. +.PP +\&\f(CW\*(C`minlenret\*(C'\fR is the minimum length (in characters) of the string that would +be found in $& after a match. +.PP +The difference between \f(CW\*(C`minlen\*(C'\fR and \f(CW\*(C`minlenret\*(C'\fR can be seen in the +following pattern: +.PP +.Vb 1 +\& /ns(?=\ed)/ +.Ve +.PP +where the \f(CW\*(C`minlen\*(C'\fR would be 3 but \f(CW\*(C`minlenret\*(C'\fR would only be 2 as the \ed is +required to match but is not actually +included in the matched content. This +distinction is particularly important as the substitution logic uses the +\&\f(CW\*(C`minlenret\*(C'\fR to tell if it can do in-place substitutions (these can +result in considerable speed-up). +.ie n .SS """gofs""" +.el .SS \f(CWgofs\fP +.IX Subsection "gofs" +Left offset from \fBpos()\fR to start match at. +.ie n .SS """substrs""" +.el .SS \f(CWsubstrs\fP +.IX Subsection "substrs" +Substring data about strings that must appear in the final match. This +is currently only used internally by Perl's engine, but might be +used in the future for all engines for optimisations. 
+.ie n .SS """nparens"", ""lastparen"", and ""lastcloseparen""" +.el .SS "\f(CWnparens\fP, \f(CWlastparen\fP, and \f(CWlastcloseparen\fP" +.IX Subsection "nparens, lastparen, and lastcloseparen" +These fields are used to keep track of: how many paren capture groups +there are in the pattern; which was the highest paren to be closed (see +"$+" in perlvar); and which was the most recent paren to be closed (see +"$^N" in perlvar). +.ie n .SS """intflags""" +.el .SS \f(CWintflags\fP +.IX Subsection "intflags" +The engine's private copy of the flags the pattern was compiled with. Usually +this is the same as \f(CW\*(C`extflags\*(C'\fR unless the engine chose to modify one of them. +.ie n .SS """pprivate""" +.el .SS \f(CWpprivate\fP +.IX Subsection "pprivate" +A void* pointing to an engine-defined +data structure. The Perl engine uses the +\&\f(CW\*(C`regexp_internal\*(C'\fR structure (see "Base Structures" in perlreguts) but a custom +engine should use something else. +.ie n .SS """offs""" +.el .SS \f(CWoffs\fP +.IX Subsection "offs" +A \f(CW\*(C`regexp_paren_pair\*(C'\fR structure which defines offsets into the string being +matched which correspond to the \f(CW$&\fR and \f(CW$1\fR, \f(CW$2\fR etc. captures, the +\&\f(CW\*(C`regexp_paren_pair\*(C'\fR struct is defined as follows: +.PP +.Vb 4 +\& typedef struct regexp_paren_pair { +\& I32 start; +\& I32 end; +\& } regexp_paren_pair; +.Ve +.PP +If \f(CW\*(C`\->offs[num].start\*(C'\fR or \f(CW\*(C`\->offs[num].end\*(C'\fR is \f(CW\-1\fR then that +capture group did not match. +\&\f(CW\*(C`\->offs[0].start/end\*(C'\fR represents \f(CW$&\fR (or +\&\f(CW\*(C`${^MATCH}\*(C'\fR under \f(CW\*(C`/p\*(C'\fR) and \f(CW\*(C`\->offs[paren].end\*(C'\fR matches \f(CW$$paren\fR where +\&\f(CW\*(C`$paren >= 1\*(C'\fR. +.ie n .SS """precomp"" ""prelen""" +.el .SS "\f(CWprecomp\fP \f(CWprelen\fP" +.IX Subsection "precomp prelen" +Used for optimisations. 
\f(CW\*(C`precomp\*(C'\fR holds a copy of the pattern that +was compiled and \f(CW\*(C`prelen\*(C'\fR its length. When a new pattern is to be +compiled (such as inside a loop) the internal \f(CW\*(C`regcomp\*(C'\fR operator +checks if the last compiled \f(CW\*(C`REGEXP\*(C'\fR's \f(CW\*(C`precomp\*(C'\fR and \f(CW\*(C`prelen\*(C'\fR +are equivalent to the new one, and if so uses the old pattern instead +of compiling a new one. +.PP +The relevant snippet from \f(CW\*(C`Perl_pp_regcomp\*(C'\fR: +.PP +.Vb 3 +\& if (!re || !re\->precomp || re\->prelen != (I32)len || +\& memNE(re\->precomp, t, len)) +\& /* Compile a new pattern */ +.Ve +.ie n .SS """paren_names""" +.el .SS \f(CWparen_names\fP +.IX Subsection "paren_names" +This is a hash used internally to track named capture groups and their +offsets. The keys are the names of the buffers; the values are dualvars, +with the IV slot holding the number of buffers with the given name and the +pv being an embedded array of I32. The values may also be contained +independently in the data array in cases where named backreferences are +used. +.ie n .SS """substrs""" +.el .SS \f(CWsubstrs\fP +.IX Subsection "substrs" +Holds information on the longest string that must occur at a fixed +offset from the start of the pattern, and the longest string that must +occur at a floating offset from the start of the pattern. Used to do +Fast-Boyer-Moore searches on the string to find out if it's worth using +the regex engine at all, and if so where in the string to search. +.ie n .SS """subbeg"" ""sublen"" ""saved_copy"" ""suboffset"" ""subcoffset""" +.el .SS "\f(CWsubbeg\fP \f(CWsublen\fP \f(CWsaved_copy\fP \f(CWsuboffset\fP \f(CWsubcoffset\fP" +.IX Subsection "subbeg sublen saved_copy suboffset subcoffset" +Used during the execution phase for managing search and replace patterns, +and for providing the text for \f(CW$&\fR, \f(CW$1\fR etc. 
\f(CW\*(C`subbeg\*(C'\fR points to a +buffer (either the original string, or a copy in the case of +\&\f(CWRX_MATCH_COPIED(rx)\fR), and \f(CW\*(C`sublen\*(C'\fR is the length of the buffer. The +\&\f(CW\*(C`RX_OFFS\*(C'\fR start and end indices index into this buffer. +.PP +In the presence of the \f(CW\*(C`REXEC_COPY_STR\*(C'\fR flag, but with the addition of +the \f(CW\*(C`REXEC_COPY_SKIP_PRE\*(C'\fR or \f(CW\*(C`REXEC_COPY_SKIP_POST\*(C'\fR flags, an engine +can choose not to copy the full buffer (although it must still do so in +the presence of \f(CW\*(C`RXf_PMf_KEEPCOPY\*(C'\fR or the relevant bits being set in +\&\f(CW\*(C`PL_sawampersand\*(C'\fR). In this case, it may set \f(CW\*(C`suboffset\*(C'\fR to indicate the +number of bytes from the logical start of the buffer to the physical start +(i.e. \f(CW\*(C`subbeg\*(C'\fR). It should also set \f(CW\*(C`subcoffset\*(C'\fR, the number of +characters in the offset. The latter is needed to support \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR +which work in characters, not bytes. +.ie n .SS """wrapped"" ""wraplen""" +.el .SS "\f(CWwrapped\fP \f(CWwraplen\fP" +.IX Subsection "wrapped wraplen" +Stores the string \f(CW\*(C`qr//\*(C'\fR stringifies to. The Perl engine for example +stores \f(CW\*(C`(?^:eek)\*(C'\fR in the case of \f(CW\*(C`qr/eek/\*(C'\fR. +.PP +When using a custom engine that doesn't support the \f(CW\*(C`(?:)\*(C'\fR construct +for inline modifiers, it's probably best to have \f(CW\*(C`qr//\*(C'\fR stringify to +the supplied pattern. Note that this will create undesired patterns in +cases such as: +.PP +.Vb 3 +\& my $x = qr/a|b/; # "a|b" +\& my $y = qr/c/i; # "c" +\& my $z = qr/$x$y/; # "a|bc" +.Ve +.PP +There's no solution for this problem other than making the custom +engine understand a construct like \f(CW\*(C`(?:)\*(C'\fR. +.ie n .SS """seen_evals""" +.el .SS \f(CWseen_evals\fP +.IX Subsection "seen_evals" +This stores the number of eval groups in +the pattern. 
This is used for security +purposes when embedding compiled regexes into larger patterns with \f(CW\*(C`qr//\*(C'\fR. +.ie n .SS """refcnt""" +.el .SS \f(CWrefcnt\fP +.IX Subsection "refcnt" +The number of times the structure is referenced. When +this falls to 0, the regexp is automatically freed +by a call to \f(CW\*(C`pregfree\*(C'\fR. This should be set to 1 in +each engine's "comp" routine. +.SH HISTORY +.IX Header "HISTORY" +Originally part of perlreguts. +.SH AUTHORS +.IX Header "AUTHORS" +Originally written by Yves Orton, expanded by Ævar Arnfjörð +Bjarmason. +.SH LICENSE +.IX Header "LICENSE" +Copyright 2006 Yves Orton and 2007 Ævar Arnfjörð Bjarmason. +.PP +This program is free software; you can redistribute it and/or modify it under +the same terms as Perl itself. |