diff options
Diffstat (limited to 'upstream/debian-bookworm/man1/perlfaq6.1')
-rw-r--r-- | upstream/debian-bookworm/man1/perlfaq6.1 | 1286 |
1 files changed, 1286 insertions, 0 deletions
diff --git a/upstream/debian-bookworm/man1/perlfaq6.1 b/upstream/debian-bookworm/man1/perlfaq6.1 new file mode 100644 index 00000000..dbb6bc2d --- /dev/null +++ b/upstream/debian-bookworm/man1/perlfaq6.1 @@ -0,0 +1,1286 @@ +.\" Automatically generated by Pod::Man 4.14 (Pod::Simple 3.43) +.\" +.\" Standard preamble: +.\" ======================================================================== +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Vb \" Begin verbatim text +.ft CW +.nf +.ne \\$1 +.. +.de Ve \" End verbatim text +.ft R +.fi +.. +.\" Set up some character translations and predefined strings. \*(-- will +.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left +.\" double quote, and \*(R" will give a right double quote. \*(C+ will +.\" give a nicer C++. Capital omega is used to do unbreakable dashes and +.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, +.\" nothing in troff, for use with C<>. +.tr \(*W- +.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' +.ie n \{\ +. ds -- \(*W- +. ds PI pi +. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch +. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch +. ds L" "" +. ds R" "" +. ds C` "" +. ds C' "" +'br\} +.el\{\ +. ds -- \|\(em\| +. ds PI \(*p +. ds L" `` +. ds R" '' +. ds C` +. ds C' +'br\} +.\" +.\" Escape single quotes in literal strings from groff's Unicode transform. +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" +.\" If the F register is >0, we'll generate index entries on stderr for +.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index +.\" entries marked with X<> in POD. Of course, you'll have to process the +.\" output yourself in some meaningful fashion. +.\" +.\" Avoid warning from groff about undefined register 'F'. +.de IX +.. +.nr rF 0 +.if \n(.g .if rF .nr rF 1 +.if (\n(rF:(\n(.g==0)) \{\ +. if \nF \{\ +. de IX +. tm Index:\\$1\t\\n%\t"\\$2" +.. +. 
if !\nF==2 \{\ +. nr % 0 +. nr F 2 +. \} +. \} +.\} +.rr rF +.\" ======================================================================== +.\" +.IX Title "PERLFAQ6 1" +.TH PERLFAQ6 1 "2023-11-25" "perl v5.36.0" "Perl Programmers Reference Guide" +.\" For nroff, turn off justification. Always turn off hyphenation; it makes +.\" way too many mistakes in technical documents. +.if n .ad l +.nh +.SH "NAME" +perlfaq6 \- Regular Expressions +.SH "VERSION" +.IX Header "VERSION" +version 5.20210520 +.SH "DESCRIPTION" +.IX Header "DESCRIPTION" +This section is surprisingly small because the rest of the \s-1FAQ\s0 is +littered with answers involving regular expressions. For example, +decoding a \s-1URL\s0 and checking whether something is a number can be handled +with regular expressions, but those answers are found elsewhere in +this document (in perlfaq9: \*(L"How do I decode or create those %\-encodings +on the web\*(R" and perlfaq4: \*(L"How do I determine whether a scalar is +a number/whole/integer/float\*(R", to be precise). +.SS "How can I hope to use regular expressions without creating illegible and unmaintainable code?" +.IX Xref "regex, legibility regexp, legibility regular expression, legibility x" +.IX Subsection "How can I hope to use regular expressions without creating illegible and unmaintainable code?" +Three techniques can make regular expressions maintainable and +understandable. +.IP "Comments Outside the Regex" 4 +.IX Item "Comments Outside the Regex" +Describe what you're doing and how you're doing it, using normal Perl +comments. +.Sp +.Vb 3 +\& # turn the line into the first word, a colon, and the +\& # number of characters on the rest of the line +\& s/^(\ew+)(.*)/ lc($1) . ":" . 
length($2) /meg; +.Ve +.IP "Comments Inside the Regex" 4 +.IX Item "Comments Inside the Regex" +The \f(CW\*(C`/x\*(C'\fR modifier causes whitespace to be ignored in a regex pattern +(except in a character class and a few other places), and also allows you to +use normal comments there, too. As you can imagine, whitespace and comments +help a lot. +.Sp +\&\f(CW\*(C`/x\*(C'\fR lets you turn this: +.Sp +.Vb 1 +\& s{<(?:[^>\*(Aq"]*|".*?"|\*(Aq.*?\*(Aq)+>}{}gs; +.Ve +.Sp +into this: +.Sp +.Vb 10 +\& s{ < # opening angle bracket +\& (?: # Non\-backreffing grouping paren +\& [^>\*(Aq"] * # 0 or more things that are neither > nor \*(Aq nor " +\& | # or else +\& ".*?" # a section between double quotes (stingy match) +\& | # or else +\& \*(Aq.*?\*(Aq # a section between single quotes (stingy match) +\& ) + # all occurring one or more times +\& > # closing angle bracket +\& }{}gsx; # replace with nothing, i.e. delete +.Ve +.Sp +It's still not quite so clear as prose, but it is very useful for +describing the meaning of each part of the pattern. +.IP "Different Delimiters" 4 +.IX Item "Different Delimiters" +While we normally think of patterns as being delimited with \f(CW\*(C`/\*(C'\fR +characters, they can be delimited by almost any character. perlre +describes this. For example, the \f(CW\*(C`s///\*(C'\fR above uses braces as +delimiters. Selecting another delimiter can avoid quoting the +delimiter within the pattern: +.Sp +.Vb 2 +\& s/\e/usr\e/local/\e/usr\e/share/g; # bad delimiter choice +\& s#/usr/local#/usr/share#g; # better +.Ve +.Sp +Using logically paired delimiters can be even more readable: +.Sp +.Vb 1 +\& s{/usr/local/}{/usr/share}g; # better still +.Ve +.SS "I'm having trouble matching over more than one line. What's wrong?" +.IX Xref "regex, multiline regexp, multiline regular expression, multiline" +.IX Subsection "I'm having trouble matching over more than one line. What's wrong?" 
+Either you don't have more than one line in the string you're looking +at (probably), or else you aren't using the correct modifier(s) on +your pattern (possibly). +.PP +There are many ways to get multiline data into a string. If you want +it to happen automatically while reading input, you'll want to set $/ +(probably to '' for paragraphs or \f(CW\*(C`undef\*(C'\fR for the whole file) to +allow you to read more than one line at a time. +.PP +Read perlre to help you decide which of \f(CW\*(C`/s\*(C'\fR and \f(CW\*(C`/m\*(C'\fR (or both) +you might want to use: \f(CW\*(C`/s\*(C'\fR allows dot to include newline, and \f(CW\*(C`/m\*(C'\fR +allows caret and dollar to match next to a newline, not just at the +end of the string. You do need to make sure that you've actually +got a multiline string in there. +.PP +For example, this program detects duplicate words, even when they span +line breaks (but not paragraph ones). For this example, we don't need +\&\f(CW\*(C`/s\*(C'\fR because we aren't using dot in a regular expression that we want +to cross line boundaries. Neither do we need \f(CW\*(C`/m\*(C'\fR because we don't +want caret or dollar to match at any point inside the record next +to newlines. But it's imperative that $/ be set to something other +than the default, or else we won't actually ever have a multiline +record read in. 
+.PP +.Vb 6 +\& $/ = \*(Aq\*(Aq; # read in whole paragraph, not just one line +\& while ( <> ) { +\& while ( /\eb([\ew\*(Aq\-]+)(\es+\eg1)+\eb/gi ) { # word starts alpha +\& print "Duplicate $1 at paragraph $.\en"; +\& } +\& } +.Ve +.PP +Here's some code that finds lines that begin with \*(L"From \*(R" (which would +be mangled by many mailers): +.PP +.Vb 6 +\& $/ = \*(Aq\*(Aq; # read in whole paragraph, not just one line +\& while ( <> ) { +\& while ( /^From /gm ) { # /m makes ^ match next to \en +\& print "leading From in paragraph $.\en"; +\& } +\& } +.Ve +.PP +Here's code that finds everything between \s-1START\s0 and \s-1END\s0 in a file: +.PP +.Vb 6 +\& undef $/; # read in whole file, not just one line or paragraph +\& while ( <> ) { +\& while ( /START(.*?)END/sgm ) { # /s makes . cross line boundaries +\& print "$1\en"; +\& } +\& } +.Ve +.SS "How can I pull out lines between two patterns that are themselves on different lines?" +.IX Xref ".." +.IX Subsection "How can I pull out lines between two patterns that are themselves on different lines?" +You can use Perl's somewhat exotic \f(CW\*(C`..\*(C'\fR operator (documented in +perlop): +.PP +.Vb 1 +\& perl \-ne \*(Aqprint if /START/ .. /END/\*(Aq file1 file2 ... +.Ve +.PP +If you wanted text and not lines, you would use +.PP +.Vb 1 +\& perl \-0777 \-ne \*(Aqprint "$1\en" while /START(.*?)END/gs\*(Aq file1 file2 ... +.Ve +.PP +But if you want nested occurrences of \f(CW\*(C`START\*(C'\fR through \f(CW\*(C`END\*(C'\fR, you'll +run up against the problem described in the question in this section +on matching balanced text. +.PP +Here's another example of using \f(CW\*(C`..\*(C'\fR: +.PP +.Vb 7 +\& while (<>) { +\& my $in_header = 1 .. /^$/; +\& my $in_body = /^$/ .. eof; +\& # now choose between them +\& } continue { +\& $. = 0 if eof; # fix $. +\& } +.Ve +.SS "How do I match \s-1XML, HTML,\s0 or other nasty, ugly things with a regex?" 
+.IX Xref "regex, XML regex, HTML XML HTML pain frustration sucking out, will to live" +.IX Subsection "How do I match XML, HTML, or other nasty, ugly things with a regex?" +Do not use regexes. Use a module and forget about the +regular expressions. The XML::LibXML, HTML::TokeParser and +HTML::TreeBuilder modules are good starts, although each namespace +has other parsing modules specialized for certain tasks and different +ways of doing it. Start at \s-1CPAN\s0 Search ( <http://metacpan.org/> ) +and wonder at all the work people have done for you already! :) +.SS "I put a regular expression into $/ but it didn't work. What's wrong?" +.IX Xref "$ , regexes in $INPUT_RECORD_SEPARATOR, regexes in $RS, regexes in" +.IX Subsection "I put a regular expression into $/ but it didn't work. What's wrong?" +$/ has to be a string. You can use these examples if you really need to +do this. +.PP +If you have File::Stream, this is easy. +.PP +.Vb 1 +\& use File::Stream; +\& +\& my $stream = File::Stream\->new( +\& $filehandle, +\& separator => qr/\es*,\es*/, +\& ); +\& +\& print "$_\en" while <$stream>; +.Ve +.PP +If you don't have File::Stream, you have to do a little more work. +.PP +You can use the four-argument form of sysread to continually add to +a buffer. After you add to the buffer, you check if you have a +complete line (using your regular expression). +.PP +.Vb 7 +\& local $_ = ""; +\& while( sysread FH, $_, 8192, length ) { +\& while( s/^((?s).*?)your_pattern// ) { +\& my $record = $1; +\& # do stuff here. +\& } +\& } +.Ve +.PP +You can do the same thing with foreach and a match using the +c flag and the \eG anchor, if you do not mind your entire file +being in memory at the end. +.PP +.Vb 7 +\& local $_ = ""; +\& while( sysread FH, $_, 8192, length ) { +\& foreach my $record ( m/\eG((?s).*?)your_pattern/gc ) { +\& # do stuff here. 
+\& } +\& substr( $_, 0, pos ) = "" if pos; +\& } +.Ve +.SS "How do I substitute case-insensitively on the \s-1LHS\s0 while preserving case on the \s-1RHS\s0?" +.IX Xref "replace, case preserving substitute, case preserving substitution, case preserving s, case preserving" +.IX Subsection "How do I substitute case-insensitively on the LHS while preserving case on the RHS?" +Here's a lovely Perlish solution by Larry Rosler. It exploits +properties of bitwise xor on \s-1ASCII\s0 strings. +.PP +.Vb 1 +\& $_= "this is a TEsT case"; +\& +\& $old = \*(Aqtest\*(Aq; +\& $new = \*(Aqsuccess\*(Aq; +\& +\& s{(\eQ$old\eE)} +\& { uc $new | (uc $1 ^ $1) . +\& (uc(substr $1, \-1) ^ substr $1, \-1) x +\& (length($new) \- length $1) +\& }egi; +\& +\& print; +.Ve +.PP +And here it is as a subroutine, modeled after the above: +.PP +.Vb 3 +\& sub preserve_case { +\& my ($old, $new) = @_; +\& my $mask = uc $old ^ $old; +\& +\& uc $new | $mask . +\& substr($mask, \-1) x (length($new) \- length($old)) +\& } +\& +\& $string = "this is a TEsT case"; +\& $string =~ s/(test)/preserve_case($1, "success")/egi; +\& print "$string\en"; +.Ve +.PP +This prints: +.PP +.Vb 1 +\& this is a SUcCESS case +.Ve +.PP +As an alternative, to keep the case of the replacement word if it is +longer than the original, you can use this code, by Jeff Pinyan: +.PP +.Vb 3 +\& sub preserve_case { +\& my ($from, $to) = @_; +\& my ($lf, $lt) = map length, @_; +\& +\& if ($lt < $lf) { $from = substr $from, 0, $lt } +\& else { $from .= substr $to, $lf } +\& +\& return uc $to | ($from ^ uc $from); +\& } +.Ve +.PP +This changes the sentence to \*(L"this is a SUcCess case.\*(R" +.PP +Just to show that C programmers can write C in any programming language, +if you prefer a more C\-like solution, the following script makes the +substitution have the same case, letter by letter, as the original. +(It also happens to run about 240% slower than the Perlish solution runs.) 
+If the substitution has more characters than the string being substituted, +the case of the last character is used for the rest of the substitution. +.PP +.Vb 8 +\& # Original by Nathan Torkington, massaged by Jeffrey Friedl +\& # +\& sub preserve_case +\& { +\& my ($old, $new) = @_; +\& my $state = 0; # 0 = no change; 1 = lc; 2 = uc +\& my ($i, $oldlen, $newlen, $c) = (0, length($old), length($new)); +\& my $len = $oldlen < $newlen ? $oldlen : $newlen; +\& +\& for ($i = 0; $i < $len; $i++) { +\& if ($c = substr($old, $i, 1), $c =~ /[\eW\ed_]/) { +\& $state = 0; +\& } elsif (lc $c eq $c) { +\& substr($new, $i, 1) = lc(substr($new, $i, 1)); +\& $state = 1; +\& } else { +\& substr($new, $i, 1) = uc(substr($new, $i, 1)); +\& $state = 2; +\& } +\& } +\& # finish up with any remaining new (for when new is longer than old) +\& if ($newlen > $oldlen) { +\& if ($state == 1) { +\& substr($new, $oldlen) = lc(substr($new, $oldlen)); +\& } elsif ($state == 2) { +\& substr($new, $oldlen) = uc(substr($new, $oldlen)); +\& } +\& } +\& return $new; +\& } +.Ve +.ie n .SS "How can I make ""\ew"" match national character sets?" +.el .SS "How can I make \f(CW\ew\fP match national character sets?" +.IX Xref "\\w" +.IX Subsection "How can I make w match national character sets?" +Put \f(CW\*(C`use locale;\*(C'\fR in your script. The \ew character class is taken +from the current locale. +.PP +See perllocale for details. +.ie n .SS "How can I match a locale-smart version of ""/[a\-zA\-Z]/""?" +.el .SS "How can I match a locale-smart version of \f(CW/[a\-zA\-Z]/\fP?" +.IX Xref "alpha" +.IX Subsection "How can I match a locale-smart version of /[a-zA-Z]/?" +You can use the \s-1POSIX\s0 character class syntax \f(CW\*(C`/[[:alpha:]]/\*(C'\fR +documented in perlre. +.PP +No matter which locale you are in, the alphabetic characters are +the characters in \ew without the digits and the underscore. +As a regex, that looks like \f(CW\*(C`/[^\eW\ed_]/\*(C'\fR. 
Its complement, +the non-alphabetics, is then everything in \eW along with +the digits and the underscore, or \f(CW\*(C`/[\eW\ed_]/\*(C'\fR. +.SS "How can I quote a variable to use in a regex?" +.IX Xref "regex, escaping regexp, escaping regular expression, escaping" +.IX Subsection "How can I quote a variable to use in a regex?" +The Perl parser will expand \f(CW$variable\fR and \f(CW@variable\fR references in +regular expressions unless the delimiter is a single quote. Remember, +too, that the right-hand side of a \f(CW\*(C`s///\*(C'\fR substitution is considered +a double-quoted string (see perlop for more details). Remember +also that any regex special characters will be acted on unless you +precede the substitution with \eQ. Here's an example: +.PP +.Vb 2 +\& $string = "Placido P. Octopus"; +\& $regex = "P."; +\& +\& $string =~ s/$regex/Polyp/; +\& # $string is now "Polypacido P. Octopus" +.Ve +.PP +Because \f(CW\*(C`.\*(C'\fR is special in regular expressions, and can match any +single character, the regex \f(CW\*(C`P.\*(C'\fR here has matched the <Pl> in the +original string. +.PP +To escape the special meaning of \f(CW\*(C`.\*(C'\fR, we use \f(CW\*(C`\eQ\*(C'\fR: +.PP +.Vb 2 +\& $string = "Placido P. Octopus"; +\& $regex = "P."; +\& +\& $string =~ s/\eQ$regex/Polyp/; +\& # $string is now "Placido Polyp Octopus" +.Ve +.PP +The use of \f(CW\*(C`\eQ\*(C'\fR causes the \f(CW\*(C`.\*(C'\fR in the regex to be treated as a +regular character, so that \f(CW\*(C`P.\*(C'\fR matches a \f(CW\*(C`P\*(C'\fR followed by a dot. +.ie n .SS "What is ""/o"" really for?" +.el .SS "What is \f(CW/o\fP really for?" +.IX Xref " o, regular expressions compile, regular expressions" +.IX Subsection "What is /o really for?" +(contributed by brian d foy) +.PP +The \f(CW\*(C`/o\*(C'\fR option for regular expressions (documented in perlop and +perlreref) tells Perl to compile the regular expression only once. +This is only useful when the pattern contains a variable. 
Perls 5.6 +and later handle this automatically if the pattern does not change. +.PP +Since the match operator \f(CW\*(C`m//\*(C'\fR, the substitution operator \f(CW\*(C`s///\*(C'\fR, +and the regular expression quoting operator \f(CW\*(C`qr//\*(C'\fR are double-quotish +constructs, you can interpolate variables into the pattern. See the +answer to \*(L"How can I quote a variable to use in a regex?\*(R" for more +details. +.PP +This example takes a regular expression from the argument list and +prints the lines of input that match it: +.PP +.Vb 1 +\& my $pattern = shift @ARGV; +\& +\& while( <> ) { +\& print if m/$pattern/; +\& } +.Ve +.PP +Versions of Perl prior to 5.6 would recompile the regular expression +for each iteration, even if \f(CW$pattern\fR had not changed. The \f(CW\*(C`/o\*(C'\fR +would prevent this by telling Perl to compile the pattern the first +time, then reuse that for subsequent iterations: +.PP +.Vb 1 +\& my $pattern = shift @ARGV; +\& +\& while( <> ) { +\& print if m/$pattern/o; # useful for Perl < 5.6 +\& } +.Ve +.PP +In versions 5.6 and later, Perl won't recompile the regular expression +if the variable hasn't changed, so you probably don't need the \f(CW\*(C`/o\*(C'\fR +option. It doesn't hurt, but it doesn't help either. If you want any +version of Perl to compile the regular expression only once even if +the variable changes (thus, only using its initial value), you still +need the \f(CW\*(C`/o\*(C'\fR. +.PP +You can watch Perl's regular expression engine at work to verify for +yourself if Perl is recompiling a regular expression. The \f(CW\*(C`use re +\&\*(Aqdebug\*(Aq\*(C'\fR pragma (comes with Perl 5.005 and later) shows the details. +With Perls before 5.6, you should see \f(CW\*(C`re\*(C'\fR reporting that it's +compiling the regular expression on each iteration. With Perl 5.6 or +later, you should only see \f(CW\*(C`re\*(C'\fR report that for the first iteration. 
+.PP +.Vb 1 +\& use re \*(Aqdebug\*(Aq; +\& +\& my $regex = \*(AqPerl\*(Aq; +\& foreach ( qw(Perl Java Ruby Python) ) { +\& print STDERR "\-" x 73, "\en"; +\& print STDERR "Trying $_...\en"; +\& print STDERR "\et$_ is good!\en" if m/$regex/; +\& } +.Ve +.SS "How do I use a regular expression to strip C\-style comments from a file?" +.IX Subsection "How do I use a regular expression to strip C-style comments from a file?" +While this actually can be done, it's much harder than you'd think. +For example, this one-liner +.PP +.Vb 1 +\& perl \-0777 \-pe \*(Aqs{/\e*.*?\e*/}{}gs\*(Aq foo.c +.Ve +.PP +will work in many but not all cases. You see, it's too simple-minded for +certain kinds of C programs, in particular, those with what appear to be +comments in quoted strings. For that, you'd need something like this, +created by Jeffrey Friedl and later modified by Fred Curtis. +.PP +.Vb 4 +\& $/ = undef; +\& $_ = <>; +\& s#/\e*[^*]*\e*+([^/*][^*]*\e*+)*/|("(\e\e.|[^"\e\e])*"|\*(Aq(\e\e.|[^\*(Aq\e\e])*\*(Aq|.[^/"\*(Aq\e\e]*)#defined $2 ? $2 : ""#gse; +\& print; +.Ve +.PP +This could, of course, be more legibly written with the \f(CW\*(C`/x\*(C'\fR modifier, adding +whitespace and comments. Here it is expanded, courtesy of Fred Curtis. +.PP +.Vb 8 +\& s{ +\& /\e* ## Start of /* ... */ comment +\& [^*]*\e*+ ## Non\-* followed by 1\-or\-more *\*(Aqs +\& ( +\& [^/*][^*]*\e*+ +\& )* ## 0\-or\-more things which don\*(Aqt start with / +\& ## but do end with \*(Aq*\*(Aq +\& / ## End of /* ... */ comment +\& +\& | ## OR various things which aren\*(Aqt comments: +\& +\& ( +\& " ## Start of " ... " string +\& ( +\& \e\e. ## Escaped char +\& | ## OR +\& [^"\e\e] ## Non "\e +\& )* +\& " ## End of " ... " string +\& +\& | ## OR +\& +\& \*(Aq ## Start of \*(Aq ... \*(Aq string +\& ( +\& \e\e. ## Escaped char +\& | ## OR +\& [^\*(Aq\e\e] ## Non \*(Aq\e +\& )* +\& \*(Aq ## End of \*(Aq ... \*(Aq string +\& +\& | ## OR +\& +\& . 
## Any other char +\& [^/"\*(Aq\e\e]* ## Chars which don\*(Aqt start a comment, string or escape +\& ) +\& }{defined $2 ? $2 : ""}gxse; +.Ve +.PP +A slight modification also removes \*(C+ comments, possibly spanning multiple lines +using a continuation character: +.PP +.Vb 1 +\& s#/\e*[^*]*\e*+([^/*][^*]*\e*+)*/|//([^\e\e]|[^\en][\en]?)*?\en|("(\e\e.|[^"\e\e])*"|\*(Aq(\e\e.|[^\*(Aq\e\e])*\*(Aq|.[^/"\*(Aq\e\e]*)#defined $3 ? $3 : ""#gse; +.Ve +.SS "Can I use Perl regular expressions to match balanced text?" +.IX Xref "regex, matching balanced test regexp, matching balanced test regular expression, matching balanced test possessive PARNO Text::Balanced Regexp::Common backtracking recursion" +.IX Subsection "Can I use Perl regular expressions to match balanced text?" +(contributed by brian d foy) +.PP +Your first try should probably be the Text::Balanced module, which +has been in the Perl standard library since Perl 5.8. It has a variety of +functions to deal with tricky text. The Regexp::Common module can +also help by providing canned patterns you can use. +.PP +As of Perl 5.10, you can match balanced text with regular expressions +using recursive patterns. Before Perl 5.10, you had to resort to +various tricks such as using Perl code in \f(CW\*(C`(??{})\*(C'\fR sequences. +.PP +Here's an example using a recursive regular expression. The goal is to +capture all of the text within angle brackets, including the text in +nested angle brackets. This sample text has two \*(L"major\*(R" groups: a +group with one level of nesting and a group with two levels of +nesting. There are five total groups in angle brackets: +.PP +.Vb 3 +\& I have some <brackets in <nested brackets> > and +\& <another group <nested once <nested twice> > > +\& and that\*(Aqs it. +.Ve +.PP +The regular expression to match the balanced text uses two new (to +Perl 5.10) regular expression features. These are covered in perlre +and this example is a modified version of one in that documentation. 
+.PP +First, adding the new possessive \f(CW\*(C`+\*(C'\fR to any quantifier finds the +longest match and does not backtrack. That's important since you want +to handle any angle brackets through the recursion, not backtracking. +The group \f(CW\*(C`[^<>]++\*(C'\fR finds one or more non-angle brackets without +backtracking. +.PP +Second, the new \f(CW\*(C`(?PARNO)\*(C'\fR refers to the sub-pattern in the +particular capture group given by \f(CW\*(C`PARNO\*(C'\fR. In the following regex, +the first capture group finds (and remembers) the balanced text, and +you need that same pattern within the first buffer to get past the +nested text. That's the recursive part. The \f(CW\*(C`(?1)\*(C'\fR uses the pattern +in the outer capture group as an independent part of the regex. +.PP +Putting it all together, you have: +.PP +.Vb 1 +\& #!/usr/local/bin/perl5.10.0 +\& +\& my $string =<<"HERE"; +\& I have some <brackets in <nested brackets> > and +\& <another group <nested once <nested twice> > > +\& and that\*(Aqs it. +\& HERE +\& +\& my @groups = $string =~ m/ +\& ( # start of capture group 1 +\& < # match an opening angle bracket +\& (?: +\& [^<>]++ # one or more non angle brackets, non backtracking +\& | +\& (?1) # found < or >, so recurse to capture group 1 +\& )* +\& > # match a closing angle bracket +\& ) # end of capture group 1 +\& /xg; +\& +\& $" = "\en\et"; +\& print "Found:\en\et@groups\en"; +.Ve +.PP +The output shows that Perl found the two major groups: +.PP +.Vb 3 +\& Found: +\& <brackets in <nested brackets> > +\& <another group <nested once <nested twice> > > +.Ve +.PP +With a little extra work, you can get all of the groups in angle +brackets even if they are in other angle brackets too. Each time you +get a balanced match, remove its outer delimiter (that's the one you +just matched so don't match it again) and add it to a queue of strings +to process. 
Keep doing that until you get no matches: +.PP +.Vb 1 +\& #!/usr/local/bin/perl5.10.0 +\& +\& my @queue =<<"HERE"; +\& I have some <brackets in <nested brackets> > and +\& <another group <nested once <nested twice> > > +\& and that\*(Aqs it. +\& HERE +\& +\& my $regex = qr/ +\& ( # start of bracket 1 +\& < # match an opening angle bracket +\& (?: +\& [^<>]++ # one or more non angle brackets, non backtracking +\& | +\& (?1) # recurse to bracket 1 +\& )* +\& > # match a closing angle bracket +\& ) # end of bracket 1 +\& /x; +\& +\& $" = "\en\et"; +\& +\& while( @queue ) { +\& my $string = shift @queue; +\& +\& my @groups = $string =~ m/$regex/g; +\& print "Found:\en\et@groups\en\en" if @groups; +\& +\& unshift @queue, map { s/^<//; s/>$//; $_ } @groups; +\& } +.Ve +.PP +The output shows all of the groups. The outermost matches show up +first and the nested matches show up later: +.PP +.Vb 3 +\& Found: +\& <brackets in <nested brackets> > +\& <another group <nested once <nested twice> > > +\& +\& Found: +\& <nested brackets> +\& +\& Found: +\& <nested once <nested twice> > +\& +\& Found: +\& <nested twice> +.Ve +.SS "What does it mean that regexes are greedy? How can I get around it?" +.IX Xref "greedy greediness" +.IX Subsection "What does it mean that regexes are greedy? How can I get around it?" +Most people mean that greedy regexes match as much as they can. +Technically speaking, it's actually the quantifiers (\f(CW\*(C`?\*(C'\fR, \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR, +\&\f(CW\*(C`{}\*(C'\fR) that are greedy rather than the whole pattern; Perl prefers local +greed and immediate gratification to overall greed. To get non-greedy +versions of the same quantifiers, use (\f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, \f(CW\*(C`{}?\*(C'\fR). 
+.PP +An example: +.PP +.Vb 3 +\& my $s1 = my $s2 = "I am very very cold"; +\& $s1 =~ s/ve.*y //; # I am cold +\& $s2 =~ s/ve.*?y //; # I am very cold +.Ve +.PP +Notice how the second substitution stopped matching as soon as it +encountered \*(L"y \*(R". The \f(CW\*(C`*?\*(C'\fR quantifier effectively tells the regular +expression engine to find a match as quickly as possible and pass +control on to whatever is next in line, as you would if you were +playing hot potato. +.SS "How do I process each word on each line?" +.IX Xref "word" +.IX Subsection "How do I process each word on each line?" +Use the split function: +.PP +.Vb 5 +\& while (<>) { +\& foreach my $word ( split ) { +\& # do something with $word here +\& } +\& } +.Ve +.PP +Note that this isn't really a word in the English sense; it's just +chunks of consecutive non-whitespace characters. +.PP +To work with only alphanumeric sequences (including underscores), you +might consider +.PP +.Vb 5 +\& while (<>) { +\& foreach $word (m/(\ew+)/g) { +\& # do something with $word here +\& } +\& } +.Ve +.SS "How can I print out a word-frequency or line-frequency summary?" +.IX Subsection "How can I print out a word-frequency or line-frequency summary?" +To do this, you have to parse out each word in the input stream. 
We'll +pretend that by word you mean chunk of alphabetics, hyphens, or +apostrophes, rather than the non-whitespace chunk idea of a word given +in the previous question: +.PP +.Vb 6 +\& my (%seen); +\& while (<>) { +\& while ( /(\eb[^\eW_\ed][\ew\*(Aq\-]+\eb)/g ) { # misses "\`sheep\*(Aq" +\& $seen{$1}++; +\& } +\& } +\& +\& while ( my ($word, $count) = each %seen ) { +\& print "$count $word\en"; +\& } +.Ve +.PP +If you wanted to do the same thing for lines, you wouldn't need a +regular expression: +.PP +.Vb 1 +\& my (%seen); +\& +\& while (<>) { +\& $seen{$_}++; +\& } +\& +\& while ( my ($line, $count) = each %seen ) { +\& print "$count $line"; +\& } +.Ve +.PP +If you want these output in a sorted order, see perlfaq4: \*(L"How do I +sort a hash (optionally by value instead of key)?\*(R". +.SS "How can I do approximate matching?" +.IX Xref "match, approximate matching, approximate" +.IX Subsection "How can I do approximate matching?" +See the module String::Approx available from \s-1CPAN.\s0 +.SS "How do I efficiently match many regular expressions at once?" +.IX Xref "regex, efficiency regexp, efficiency regular expression, efficiency" +.IX Subsection "How do I efficiently match many regular expressions at once?" +(contributed by brian d foy) +.PP +You want to +avoid compiling a regular expression every time you want to match it. +In this example, perl must recompile the regular expression for every +iteration of the \f(CW\*(C`foreach\*(C'\fR loop since \f(CW$pattern\fR can change: +.PP +.Vb 1 +\& my @patterns = qw( fo+ ba[rz] ); +\& +\& LINE: while( my $line = <> ) { +\& foreach my $pattern ( @patterns ) { +\& if( $line =~ m/\eb$pattern\eb/i ) { +\& print $line; +\& next LINE; +\& } +\& } +\& } +.Ve +.PP +The \f(CW\*(C`qr//\*(C'\fR operator compiles a regular +expression, but doesn't apply it. When you use the pre-compiled +version of the regex, perl does less work. 
In this example, I inserted +a \f(CW\*(C`map\*(C'\fR to turn each pattern into its pre-compiled form. The rest of +the script is the same, but faster: +.PP +.Vb 1 +\& my @patterns = map { qr/\eb$_\eb/i } qw( fo+ ba[rz] ); +\& +\& LINE: while( my $line = <> ) { +\& foreach my $pattern ( @patterns ) { +\& if( $line =~ m/$pattern/ ) { +\& print $line; +\& next LINE; +\& } +\& } +\& } +.Ve +.PP +In some cases, you may be able to make several patterns into a single +regular expression. Beware of situations that require backtracking +though. In this example, the regex is only compiled once because +\&\f(CW$regex\fR doesn't change between iterations: +.PP +.Vb 1 +\& my $regex = join \*(Aq|\*(Aq, qw( fo+ ba[rz] ); +\& +\& while( my $line = <> ) { +\& print if $line =~ m/\eb(?:$regex)\eb/i; +\& } +.Ve +.PP +The function \*(L"list2re\*(R" in Data::Munge on \s-1CPAN\s0 can also be used to form +a single regex that matches a list of literal strings (not regexes). +.PP +For more details on regular expression efficiency, see \fIMastering +Regular Expressions\fR by Jeffrey Friedl. He explains how the regular +expressions engine works and why some patterns are surprisingly +inefficient. Once you understand how perl applies regular expressions, +you can tune them for individual situations. +.ie n .SS "Why don't word-boundary searches with ""\eb"" work for me?" +.el .SS "Why don't word-boundary searches with \f(CW\eb\fP work for me?" +.IX Xref "\\b" +.IX Subsection "Why don't word-boundary searches with b work for me?" +(contributed by brian d foy) +.PP +Ensure that you know what \eb really does: it's the boundary between a +word character, \ew, and something that isn't a word character. That +thing that isn't a word character might be \eW, but it can also be the +start or end of the string. +.PP +It's not (not!) the boundary between whitespace and non-whitespace, +and it's not the stuff between words we use to create sentences. 
+.PP +In regex speak, a word boundary (\eb) is a \*(L"zero width assertion\*(R", +meaning that it doesn't represent a character in the string, but a +condition at a certain position. +.PP +For the regular expression, /\ebPerl\eb/, there has to be a word +boundary before the \*(L"P\*(R" and after the \*(L"l\*(R". As long as something other +than a word character precedes the \*(L"P\*(R" and follows the \*(L"l\*(R", the +pattern will match. These strings match /\ebPerl\eb/. +.PP +.Vb 4 +\& "Perl" # no word char before "P" or after "l" +\& "Perl " # same as previous (space is not a word char) +\& "\*(AqPerl\*(Aq" # the "\*(Aq" char is not a word char +\& "Perl\*(Aqs" # no word char before "P", non\-word char after "l" +.Ve +.PP +These strings do not match /\ebPerl\eb/. +.PP +.Vb 2 +\& "Perl_" # "_" is a word char! +\& "Perler" # no word char before "P", but one after "l" +.Ve +.PP +You don't have to use \eb to match words though. You can look for +non-word characters surrounded by word characters. These strings +match the pattern /\eb'\eb/. +.PP +.Vb 2 +\& "don\*(Aqt" # the "\*(Aq" char is surrounded by "n" and "t" +\& "qep\*(Aqa\*(Aq" # the "\*(Aq" char is surrounded by "p" and "a" +.Ve +.PP +These strings do not match /\eb'\eb/. +.PP +.Vb 1 +\& "foo\*(Aq" # there is no word char after non\-word "\*(Aq" +.Ve +.PP +You can also use the complement of \eb, \eB, to specify that there +should not be a word boundary. +.PP +In the pattern /\eBam\eB/, there must be a word character before the \*(L"a\*(R" +and after the \*(L"m\*(R". These strings match /\eBam\eB/: +.PP +.Vb 2 +\& "llama" # "am" surrounded by word chars +\& "Samuel" # same +.Ve +.PP +These strings do not match /\eBam\eB/: +.PP +.Vb 2 +\& "Sam" # no word boundary before "a", but one after "m" +\& "I am Sam" # "am" surrounded by non\-word chars +.Ve +.SS "Why does using $&, $`, or $' slow my program down?" 
+.IX Xref "$MATCH $& $POSTMATCH $' $PREMATCH $`" +.IX Subsection "Why does using $&, $`, or $' slow my program down?" +(contributed by Anno Siegel) +.PP +Once Perl sees that you need one of these variables anywhere in the +program, it provides them on each and every pattern match. That means +that on every pattern match the entire string will be copied, part of it +to $`, part to $&, and part to $'. Thus the penalty is most severe with +long strings and patterns that match often. Avoid $&, $', and $` if you +can, but if you can't, once you've used them at all, use them at will +because you've already paid the price. Remember that some algorithms +really appreciate them. As of the 5.005 release, the $& variable is no +longer \*(L"expensive\*(R" the way the other two are. +.PP +Since Perl 5.6.1 the special variables @\- and @+ can functionally replace +$`, $& and $'. These arrays contain pointers to the beginning and end +of each match (see perlvar for the full story), so they give you +essentially the same information, but without the risk of excessive +string copying. +.PP +Perl 5.10 added three specials, \f(CW\*(C`${^MATCH}\*(C'\fR, \f(CW\*(C`${^PREMATCH}\*(C'\fR, and +\&\f(CW\*(C`${^POSTMATCH}\*(C'\fR to do the same job but without the global performance +penalty. Perl 5.10 only sets these variables if you compile or execute the +regular expression with the \f(CW\*(C`/p\*(C'\fR modifier. +.ie n .SS "What good is ""\eG"" in a regular expression?" +.el .SS "What good is \f(CW\eG\fP in a regular expression?" +.IX Xref "\\G" +.IX Subsection "What good is G in a regular expression?" +You use the \f(CW\*(C`\eG\*(C'\fR anchor to start the next match on the same +string where the last match left off. The regular +expression engine cannot skip over any characters to find +the next match with this anchor, so \f(CW\*(C`\eG\*(C'\fR is similar to the +beginning of string anchor, \f(CW\*(C`^\*(C'\fR. 
The \f(CW\*(C`\eG\*(C'\fR anchor is typically +used with the \f(CW\*(C`g\*(C'\fR modifier. It uses the value of \f(CW\*(C`pos()\*(C'\fR +as the position to start the next match. As the match +operator makes successive matches, it updates \f(CW\*(C`pos()\*(C'\fR with the +position of the next character past the last match (or the +first character of the next match, depending on how you like +to look at it). Each string has its own \f(CW\*(C`pos()\*(C'\fR value. +.PP +Suppose you want to match all of the consecutive pairs of digits +in a string like \*(L"1122a44\*(R" and stop matching when you +encounter non-digits. You want to match \f(CW11\fR and \f(CW22\fR but +the letter \f(CW\*(C`a\*(C'\fR shows up between \f(CW22\fR and \f(CW44\fR and you want +to stop at \f(CW\*(C`a\*(C'\fR. Simply matching pairs of digits skips over +the \f(CW\*(C`a\*(C'\fR and still matches \f(CW44\fR. +.PP +.Vb 2 +\& $_ = "1122a44"; +\& my @pairs = m/(\ed\ed)/g; # qw( 11 22 44 ) +.Ve +.PP +If you use the \f(CW\*(C`\eG\*(C'\fR anchor, you force the match after \f(CW22\fR to +start with the \f(CW\*(C`a\*(C'\fR. The regular expression cannot match +there since it does not find a digit, so the next match +fails and the match operator returns the pairs it already +found. +.PP +.Vb 2 +\& $_ = "1122a44"; +\& my @pairs = m/\eG(\ed\ed)/g; # qw( 11 22 ) +.Ve +.PP +You can also use the \f(CW\*(C`\eG\*(C'\fR anchor in scalar context. You +still need the \f(CW\*(C`g\*(C'\fR modifier. +.PP +.Vb 4 +\& $_ = "1122a44"; +\& while( m/\eG(\ed\ed)/g ) { +\& print "Found $1\en"; +\& } +.Ve +.PP +After the match fails at the letter \f(CW\*(C`a\*(C'\fR, perl resets \f(CW\*(C`pos()\*(C'\fR +and the next match on the same string starts at the beginning. 
+.PP +.Vb 4 +\& $_ = "1122a44"; +\& while( m/\eG(\ed\ed)/g ) { +\& print "Found $1\en"; +\& } +\& +\& print "Found $1 after while" if m/(\ed\ed)/g; # finds "11" +.Ve +.PP +You can disable \f(CW\*(C`pos()\*(C'\fR resets on fail with the \f(CW\*(C`c\*(C'\fR modifier, documented +in perlop and perlreref. Subsequent matches start where the last +successful match ended (the value of \f(CW\*(C`pos()\*(C'\fR) even if a match on the +same string has failed in the meantime. In this case, the match after +the \f(CW\*(C`while()\*(C'\fR loop starts at the \f(CW\*(C`a\*(C'\fR (where the last match stopped), +and since it does not use any anchor it can skip over the \f(CW\*(C`a\*(C'\fR to find +\&\f(CW44\fR. +.PP +.Vb 4 +\& $_ = "1122a44"; +\& while( m/\eG(\ed\ed)/gc ) { +\& print "Found $1\en"; +\& } +\& +\& print "Found $1 after while" if m/(\ed\ed)/g; # finds "44" +.Ve +.PP +Typically you use the \f(CW\*(C`\eG\*(C'\fR anchor with the \f(CW\*(C`c\*(C'\fR modifier +when you want to try a different match if one fails, +such as in a tokenizer. Jeffrey Friedl offers this example +which works in 5.004 or later. +.PP +.Vb 9 +\& while (<>) { +\& chomp; +\& PARSER: { +\& m/ \eG( \ed+\eb )/gcx && do { print "number: $1\en"; redo; }; +\& m/ \eG( \ew+ )/gcx && do { print "word: $1\en"; redo; }; +\& m/ \eG( \es+ )/gcx && do { print "space: $1\en"; redo; }; +\& m/ \eG( [^\ew\ed]+ )/gcx && do { print "other: $1\en"; redo; }; +\& } +\& } +.Ve +.PP +For each line, the \f(CW\*(C`PARSER\*(C'\fR loop first tries to match a series +of digits followed by a word boundary. This match has to +start at the place the last match left off (or the beginning +of the string on the first match). Since \f(CW\*(C`m/ \eG( \ed+\eb +)/gcx\*(C'\fR uses the \f(CW\*(C`c\*(C'\fR modifier, if the string does not match that +regular expression, perl does not reset \fBpos()\fR and the next +match starts at the same position to try a different +pattern. +.SS "Are Perl regexes DFAs or NFAs? 
Are they \s-1POSIX\s0 compliant?" +.IX Xref "DFA NFA POSIX" +.IX Subsection "Are Perl regexes DFAs or NFAs? Are they POSIX compliant?" +While it's true that Perl's regular expressions resemble the DFAs +(deterministic finite automata) of the \fBegrep\fR\|(1) program, they are in +fact implemented as NFAs (non-deterministic finite automata) to allow +backtracking and backreferencing. And they aren't POSIX-style either, +because those guarantee worst-case behavior for all cases. (It seems +that some people prefer guarantees of consistency, even when what's +guaranteed is slowness.) See the book \*(L"Mastering Regular Expressions\*(R" +(from O'Reilly) by Jeffrey Friedl for all the details you could ever +hope to know on these matters (a full citation appears in +perlfaq2). +.SS "What's wrong with using grep in a void context?" +.IX Xref "grep" +.IX Subsection "What's wrong with using grep in a void context?" +The problem is that grep builds a return list, regardless of the context. +This means you're making Perl go to the trouble of building a list that +you then just throw away. If the list is large, you waste both time and space. +If your intent is to iterate over the list, then use a for loop for this +purpose. +.PP +In perls older than 5.8.1, map suffers from this problem as well. +But since 5.8.1, this has been fixed, and map is context aware \- in void +context, no lists are constructed. +.SS "How can I match strings with multibyte characters?" +.IX Xref "regex, and multibyte characters regexp, and multibyte characters regular expression, and multibyte characters martian encoding, Martian" +.IX Subsection "How can I match strings with multibyte characters?" +Starting from Perl 5.6 Perl has had some level of multibyte character +support. Perl 5.8 or later is recommended. Supported multibyte +character repertoires include Unicode, and legacy encodings +through the Encode module. See perluniintro, perlunicode, +and Encode. 
+.PP +If you are stuck with older Perls, you can do Unicode with the +Unicode::String module, and character conversions using the +Unicode::Map8 and Unicode::Map modules. If you are using +Japanese encodings, you might try using the jperl 5.005_03. +.PP +Finally, the following set of approaches was offered by Jeffrey +Friedl, whose article in issue #5 of The Perl Journal talks about +this very matter. +.PP +Let's suppose you have some weird Martian encoding where pairs of +\&\s-1ASCII\s0 uppercase letters encode single Martian letters (i.e. the two +bytes \*(L"\s-1CV\*(R"\s0 make a single Martian letter, as do the two bytes \*(L"\s-1SG\*(R", +\&\*(L"VS\*(R", \*(L"XX\*(R",\s0 etc.). Other bytes represent single characters, just like +\&\s-1ASCII.\s0 +.PP +So, the string of Martian \*(L"I am \s-1CVSGXX\s0!\*(R" uses 12 bytes to encode the +nine characters 'I', ' ', 'a', 'm', ' ', '\s-1CV\s0', '\s-1SG\s0', '\s-1XX\s0', '!'. +.PP +Now, say you want to search for the single character \f(CW\*(C`/GX/\*(C'\fR. Perl +doesn't know about Martian, so it'll find the two bytes \*(L"\s-1GX\*(R"\s0 in the \*(L"I +am \s-1CVSGXX\s0!\*(R" string, even though that character isn't there: it just +looks like it is because \*(L"\s-1SG\*(R"\s0 is next to \*(L"\s-1XX\*(R",\s0 but there's no real +\&\*(L"\s-1GX\*(R".\s0 This is a big problem. +.PP +Here are a few ways, all painful, to deal with it: +.PP +.Vb 2 +\& # Make sure adjacent "martian" bytes are no longer adjacent. 
+\& $martian =~ s/([A\-Z][A\-Z])/ $1 /g; +\& +\& print "found GX!\en" if $martian =~ /GX/; +.Ve +.PP +Or like this: +.PP +.Vb 6 +\& my @chars = $martian =~ m/([A\-Z][A\-Z]|[^A\-Z])/g; +\& # above is conceptually similar to: my @chars = $text =~ m/(.)/g; +\& # +\& foreach my $char (@chars) { +\& print "found GX!\en", last if $char eq \*(AqGX\*(Aq; +\& } +.Ve +.PP +Or like this: +.PP +.Vb 6 +\& while ($martian =~ m/\eG([A\-Z][A\-Z]|.)/gs) { # \eG probably unneeded +\& if ($1 eq \*(AqGX\*(Aq) { +\& print "found GX!\en"; +\& last; +\& } +\& } +.Ve +.PP +Here's another, slightly less painful, way to do it from Benjamin +Goldberg, who uses a zero-width negative look-behind assertion. +.PP +.Vb 5 +\& print "found GX!\en" if $martian =~ m/ +\& (?<![A\-Z]) +\& (?:[A\-Z][A\-Z])*? +\& GX +\& /x; +.Ve +.PP +This succeeds if the \*(L"martian\*(R" character \s-1GX\s0 is in the string, and fails +otherwise. If you don't like using (?<!), a zero-width negative +look-behind assertion, you can replace (?<![A\-Z]) with (?:^|[^A\-Z]). +.PP +It does have the drawback of putting the wrong thing in $\-[0] and $+[0], +but this usually can be worked around. +.SS "How do I match a regular expression that's in a variable?" +.IX Xref "regex, in variable eval regex quotemeta \\Q, regex \\E, regex qr" +.IX Subsection "How do I match a regular expression that's in a variable?" +(contributed by brian d foy) +.PP +We don't have to hard-code patterns into the match operator (or +anything else that works with regular expressions). We can put the +pattern in a variable for later use. +.PP +The match operator is a double quote context, so you can interpolate +your variable just like a double quoted string. In this case, you +read the regular expression as user input and store it in \f(CW$regex\fR. +Once you have the pattern in \f(CW$regex\fR, you use that variable in the +match operator. +.PP +.Vb 1 +\& chomp( my $regex = <STDIN> ); +\& +\& if( $string =~ m/$regex/ ) { ... 
} +.Ve +.PP +Any regular expression special characters in \f(CW$regex\fR are still +special, and the pattern still has to be valid or Perl will complain. +For instance, in this pattern there is an unpaired parenthesis. +.PP +.Vb 1 +\& my $regex = "Unmatched ( paren"; +\& +\& "Two parens to bind them all" =~ m/$regex/; +.Ve +.PP +When Perl compiles the regular expression, it treats the parenthesis +as the start of a memory match. When it doesn't find the closing +parenthesis, it complains: +.PP +.Vb 1 +\& Unmatched ( in regex; marked by <\-\- HERE in m/Unmatched ( <\-\- HERE paren/ at script line 3. +.Ve +.PP +You can get around this in several ways depending on your situation. +First, if you don't want any of the characters in the string to be +special, you can escape them with \f(CW\*(C`quotemeta\*(C'\fR before you use the string. +.PP +.Vb 2 +\& chomp( my $regex = <STDIN> ); +\& $regex = quotemeta( $regex ); +\& +\& if( $string =~ m/$regex/ ) { ... } +.Ve +.PP +You can also do this directly in the match operator using the \f(CW\*(C`\eQ\*(C'\fR +and \f(CW\*(C`\eE\*(C'\fR sequences. The \f(CW\*(C`\eQ\*(C'\fR tells Perl where to start escaping +special characters, and the \f(CW\*(C`\eE\*(C'\fR tells it where to stop (see perlop +for more details). +.PP +.Vb 1 +\& chomp( my $regex = <STDIN> ); +\& +\& if( $string =~ m/\eQ$regex\eE/ ) { ... } +.Ve +.PP +Alternately, you can use \f(CW\*(C`qr//\*(C'\fR, the regular expression quote operator (see +perlop for more details). It quotes and perhaps compiles the pattern, +and you can apply regular expression flags to the pattern. +.PP +.Vb 1 +\& chomp( my $input = <STDIN> ); +\& +\& my $regex = qr/$input/is; +\& +\& $string =~ m/$regex/ # same as m/$input/is; +.Ve +.PP +You might also want to trap any errors by wrapping an \f(CW\*(C`eval\*(C'\fR block +around the whole thing. +.PP +.Vb 1 +\& chomp( my $input = <STDIN> ); +\& +\& eval { +\& if( $string =~ m/\eQ$input\eE/ ) { ... } +\& }; +\& warn $@ if $@; +.Ve +.PP +Or... 
+.PP +.Vb 7 +\& my $regex = eval { qr/$input/is }; +\& if( defined $regex ) { +\& $string =~ m/$regex/; +\& } +\& else { +\& warn $@; +\& } +.Ve +.SH "AUTHOR AND COPYRIGHT" +.IX Header "AUTHOR AND COPYRIGHT" +Copyright (c) 1997\-2010 Tom Christiansen, Nathan Torkington, and +other authors as noted. All rights reserved. +.PP +This documentation is free; you can redistribute it and/or modify it +under the same terms as Perl itself. +.PP +Irrespective of its distribution, all code examples in this file +are hereby placed into the public domain. You are permitted and +encouraged to use this code in your own programs for fun +or for profit as you see fit. A simple comment in the code giving +credit would be courteous but is not required. |